From aa8aa9cd558ad0a0323db284ec7db5810d2abfa8 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 18 Jul 2024 09:05:50 -0400 Subject: [PATCH 001/248] Python wrapper classes for all user interfaces (#750) * Expose missing functions to python * Initial commit for creating wrapper classes and functions for all user facing python features * Remove extra level of python path that is no longer required * Move import to only happen for type checking for hints * Comment out classes from __all__ in the top level that are not currently exposed. * Add license comments * Add missing import * Functions now only has one level of depth * Applying google docstring formatting * Addressing PR request to add google formatted docstrings * Small docstring for ruff * Linting * Add docstring format checking to pre-commit stage * Set explicit return types on UDFs * Add options of passing either a path or a string * Switch to google docstring style * Update unit tests to include registering via path or string * Add py.typed file * Resolve deprecation warnings in unit tests * Add path to unit test * Expose an option in write_csv to include header and add unit test * Update write_parquet unit test to include paths or strings * Add unit test for write_json * Add unit test for substrait serialization to a file * Add unit tests for runtime config * Setting return type to typing_extensions.Self per PR recommendation * Correcting __next__ to not return None since it will raise an exception instead. 
* Add optiona parameter of decimal places to round and add unit test * Improve docstrings * Set default to None instead of empty dict * User request to allow passing multiple arguments to filter() * Enhance Expr comparison operators to accept any python value and attempt to convert it to a literal * Expose overlay and add unit test * Allow select() to take either str for column names or a full expr * Update comments on regexp and add unit tests * Remove TODO markings no longer applicable * Update udf documentation * Docstring formatting * Updating docstring formatting * Updating docstring formatting * Updating docstring formatting * Updating docstring formatting * Updating docstring formatting * Cleaning up docstring line lengths * Add pre-commit check of docstring line length * Do not emit doc entry for __init__ of some classes * Correct errors on code blocks generating in sphinx * Resolve conflict with * Add license info to py.typed * Clean up some docstring too long errors in CI * Correct ruff complain in unit tests * Temporarily install google test to get clippy to pass * Adding gmock to build step due to upstream error * Add type_extensions to conda meta file * Small comment suggestions from PR --- .github/workflows/build.yml | 4 + .github/workflows/test.yaml | 4 + benchmarks/db-benchmark/join-datafusion.py | 3 +- conda/recipes/meta.yaml | 1 + docs/source/api/functions.rst | 2 +- docs/source/conf.py | 21 + examples/substrait.py | 15 +- examples/tpch/_tests.py | 5 +- pyproject.toml | 18 + python/datafusion/__init__.py | 170 +-- python/datafusion/catalog.py | 76 + python/datafusion/common.py | 2 +- python/datafusion/context.py | 1003 +++++++++++++ python/datafusion/dataframe.py | 527 +++++++ python/datafusion/expr.py | 414 +++++- python/datafusion/functions.py | 1471 +++++++++++++++++++- python/datafusion/input/__init__.py | 5 + python/datafusion/input/base.py | 17 +- python/datafusion/input/location.py | 10 +- python/datafusion/object_store.py | 2 +- 
python/datafusion/py.typed | 16 + python/datafusion/record_batch.py | 74 + python/datafusion/substrait.py | 168 ++- python/datafusion/tests/conftest.py | 3 +- python/datafusion/tests/test_context.py | 87 +- python/datafusion/tests/test_dataframe.py | 90 +- python/datafusion/tests/test_expr.py | 25 +- python/datafusion/tests/test_functions.py | 66 +- python/datafusion/tests/test_imports.py | 15 +- python/datafusion/tests/test_sql.py | 61 +- python/datafusion/tests/test_substrait.py | 38 +- python/datafusion/tests/test_udaf.py | 4 +- python/datafusion/udf.py | 248 ++++ src/common.rs | 1 + src/common/data_type.rs | 2 +- src/dataframe.rs | 29 +- src/expr.rs | 1 + src/functions.rs | 27 +- src/lib.rs | 2 + src/substrait.rs | 2 +- 40 files changed, 4441 insertions(+), 288 deletions(-) create mode 100644 python/datafusion/catalog.py create mode 100644 python/datafusion/context.py create mode 100644 python/datafusion/dataframe.py create mode 100644 python/datafusion/py.typed create mode 100644 python/datafusion/record_batch.py create mode 100644 python/datafusion/udf.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 350be46d5..a37abe53a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -89,6 +89,10 @@ jobs: name: python-wheel-license path: . 
+ # To remove once https://github.com/MaterializeInc/rust-protobuf-native/issues/20 is resolved + - name: Install gtest + uses: MarkusJx/googletest-installer@v1.1 + - name: Install Protoc uses: arduino/setup-protoc@v1 with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4f47dc984..c9a365bbb 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -55,6 +55,10 @@ jobs: version: '3.20.2' repo-token: ${{ secrets.GITHUB_TOKEN }} + # To remove once https://github.com/MaterializeInc/rust-protobuf-native/issues/20 is resolved + - name: Install gtest + uses: MarkusJx/googletest-installer@v1.1 + - name: Setup Python uses: actions/setup-python@v5 with: diff --git a/benchmarks/db-benchmark/join-datafusion.py b/benchmarks/db-benchmark/join-datafusion.py index 4d59c7dc2..811ad8707 100755 --- a/benchmarks/db-benchmark/join-datafusion.py +++ b/benchmarks/db-benchmark/join-datafusion.py @@ -74,7 +74,8 @@ def ans_shape(batches): ctx = df.SessionContext() print(ctx) -# TODO we should be applying projections to these table reads to crete relations of different sizes +# TODO we should be applying projections to these table reads to create relations +# of different sizes x_data = pacsv.read_csv( src_jn_x, convert_options=pacsv.ConvertOptions(auto_dict_encode=True) diff --git a/conda/recipes/meta.yaml b/conda/recipes/meta.yaml index 72ac7f501..b0784253a 100644 --- a/conda/recipes/meta.yaml +++ b/conda/recipes/meta.yaml @@ -51,6 +51,7 @@ requirements: run: - python - pyarrow >=11.0.0 + - typing_extensions test: imports: diff --git a/docs/source/api/functions.rst b/docs/source/api/functions.rst index 958606df2..6f10d826e 100644 --- a/docs/source/api/functions.rst +++ b/docs/source/api/functions.rst @@ -24,4 +24,4 @@ Functions .. 
autosummary:: :toctree: ../generated/ - functions.functions + functions diff --git a/docs/source/conf.py b/docs/source/conf.py index c0da8b2cc..308069b6c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +"""Documentation generation.""" + # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full @@ -78,6 +80,25 @@ autosummary_generate = True + +def autodoc_skip_member(app, what, name, obj, skip, options): + exclude_functions = ("__init__",) # exact-name match, not a substring test + exclude_classes = ("Expr", "DataFrame") + + class_name = "" + if hasattr(obj, "__qualname__"): + if obj.__qualname__ is not None: + class_name = obj.__qualname__.split(".")[0] + + should_exclude = name in exclude_functions and class_name in exclude_classes + + return True if should_exclude else None + + +def setup(app): + app.connect("autodoc-skip-member", autodoc_skip_member) + + # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. 
See the documentation for diff --git a/examples/substrait.py b/examples/substrait.py index 23cd74649..fd4d0f9ca 100644 --- a/examples/substrait.py +++ b/examples/substrait.py @@ -18,16 +18,13 @@ from datafusion import SessionContext from datafusion import substrait as ss - # Create a DataFusion context ctx = SessionContext() # Register table with context ctx.register_csv("aggregate_test_data", "./testing/data/csv/aggregate_test_100.csv") -substrait_plan = ss.substrait.serde.serialize_to_plan( - "SELECT * FROM aggregate_test_data", ctx -) +substrait_plan = ss.Serde.serialize_to_plan("SELECT * FROM aggregate_test_data", ctx) # type(substrait_plan) -> # Encode it to bytes @@ -38,17 +35,15 @@ # Alternative serialization approaches # type(substrait_bytes) -> , at this point the bytes can be distributed to file, network, etc safely # where they could subsequently be deserialized on the receiving end. -substrait_bytes = ss.substrait.serde.serialize_bytes( - "SELECT * FROM aggregate_test_data", ctx -) +substrait_bytes = ss.Serde.serialize_bytes("SELECT * FROM aggregate_test_data", ctx) # Imagine here bytes would be read from network, file, etc ... 
for example brevity this is omitted and variable is simply reused # type(substrait_plan) -> -substrait_plan = ss.substrait.serde.deserialize_bytes(substrait_bytes) +substrait_plan = ss.Serde.deserialize_bytes(substrait_bytes) # type(df_logical_plan) -> -df_logical_plan = ss.substrait.consumer.from_substrait_plan(ctx, substrait_plan) +df_logical_plan = ss.Consumer.from_substrait_plan(ctx, substrait_plan) # Back to Substrait Plan just for demonstration purposes # type(substrait_plan) -> -substrait_plan = ss.substrait.producer.to_substrait_plan(df_logical_plan) +substrait_plan = ss.Producer.to_substrait_plan(df_logical_plan) diff --git a/examples/tpch/_tests.py b/examples/tpch/_tests.py index 8804041b1..903b53548 100644 --- a/examples/tpch/_tests.py +++ b/examples/tpch/_tests.py @@ -96,8 +96,9 @@ def test_tpch_query_vs_answer_file(query_code: str, answer_file: str): module = import_module(query_code) df = module.df - # Treat q17 as a special case. The answer file does not match the spec. Running at - # scale factor 1, we have manually verified this result does match the expected value. + # Treat q17 as a special case. The answer file does not match the spec. + # Running at scale factor 1, we have manually verified this result does + # match the expected value. 
if answer_file == "q17": return check_q17(df) diff --git a/pyproject.toml b/pyproject.toml index b706065a4..a18ef0e5e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,3 +64,21 @@ exclude = [".github/**", "ci/**", ".asf.yaml"] # Require Cargo.lock is up to date locked = true features = ["substrait"] + +# Enable docstring linting using the google style guide +[tool.ruff.lint] +select = ["E4", "E7", "E9", "F", "D", "W"] + +[tool.ruff.lint.pydocstyle] +convention = "google" + +[tool.ruff.lint.pycodestyle] +max-doc-length = 88 + +# Disable docstring checking for these directories +[tool.ruff.lint.per-file-ignores] +"python/datafusion/tests/*" = ["D"] +"examples/*" = ["D", "W505"] +"dev/*" = ["D"] +"benchmarks/*" = ["D", "F"] +"docs/*" = ["D"] diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 846b1a459..59bc8e306 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -15,80 +15,44 @@ # specific language governing permissions and limitations # under the License. -from abc import ABCMeta, abstractmethod -from typing import List +"""DataFusion python package. + +This is a Python library that binds to Apache Arrow in-memory query engine DataFusion. +See https://datafusion.apache.org/python for more information. +""" try: import importlib.metadata as importlib_metadata except ImportError: import importlib_metadata -import pyarrow as pa - -from ._internal import ( - AggregateUDF, - Config, - DataFrame, +from .context import ( SessionContext, SessionConfig, RuntimeConfig, - ScalarUDF, SQLOptions, ) +# The following imports are okay to remain as opaque to the user. 
+from ._internal import Config + +from .udf import ScalarUDF, AggregateUDF, Accumulator + from .common import ( DFSchema, ) +from .dataframe import DataFrame + from .expr import ( - Alias, - Analyze, Expr, - Filter, - Limit, - Like, - ILike, - Projection, - SimilarTo, - ScalarVariable, - Sort, - TableScan, - Not, - IsNotNull, - IsTrue, - IsFalse, - IsUnknown, - IsNotTrue, - IsNotFalse, - IsNotUnknown, - Negative, - InList, - Exists, - Subquery, - InSubquery, - ScalarSubquery, - GroupingSet, - Placeholder, - Case, - Cast, - TryCast, - Between, - Explain, - CreateMemoryTable, - SubqueryAlias, - Extension, - CreateView, - Distinct, - DropTable, - Repartition, - Partitioning, - Window, WindowFrame, ) __version__ = importlib_metadata.version(__name__) __all__ = [ + "Accumulator", "Config", "DataFrame", "SessionContext", @@ -96,78 +60,16 @@ "SQLOptions", "RuntimeConfig", "Expr", - "AggregateUDF", "ScalarUDF", - "Window", "WindowFrame", "column", "literal", - "TableScan", - "Projection", "DFSchema", - "DFField", - "Analyze", - "Sort", - "Limit", - "Filter", - "Like", - "ILike", - "SimilarTo", - "ScalarVariable", - "Alias", - "Not", - "IsNotNull", - "IsTrue", - "IsFalse", - "IsUnknown", - "IsNotTrue", - "IsNotFalse", - "IsNotUnknown", - "Negative", - "ScalarFunction", - "BuiltinScalarFunction", - "InList", - "Exists", - "Subquery", - "InSubquery", - "ScalarSubquery", - "GroupingSet", - "Placeholder", - "Case", - "Cast", - "TryCast", - "Between", - "Explain", - "SubqueryAlias", - "Extension", - "CreateMemoryTable", - "CreateView", - "Distinct", - "DropTable", - "Repartition", - "Partitioning", ] -class Accumulator(metaclass=ABCMeta): - @abstractmethod - def state(self) -> List[pa.Scalar]: - pass - - @abstractmethod - def update(self, values: pa.Array) -> None: - pass - - @abstractmethod - def merge(self, states: pa.Array) -> None: - pass - - @abstractmethod - def evaluate(self) -> pa.Scalar: - pass - - -def column(value): +def column(value: str): + """Create a column 
expression.""" return Expr.column(value) @@ -175,46 +77,12 @@ def column(value): def literal(value): - if not isinstance(value, pa.Scalar): - value = pa.scalar(value) + """Create a literal expression.""" return Expr.literal(value) lit = literal +udf = ScalarUDF.udf -def udf(func, input_types, return_type, volatility, name=None): - """ - Create a new User Defined Function - """ - if not callable(func): - raise TypeError("`func` argument must be callable") - if name is None: - name = func.__qualname__.lower() - return ScalarUDF( - name=name, - func=func, - input_types=input_types, - return_type=return_type, - volatility=volatility, - ) - - -def udaf(accum, input_type, return_type, state_type, volatility, name=None): - """ - Create a new User Defined Aggregate Function - """ - if not issubclass(accum, Accumulator): - raise TypeError("`accum` must implement the abstract base class Accumulator") - if name is None: - name = accum.__qualname__.lower() - if isinstance(input_type, pa.lib.DataType): - input_type = [input_type] - return AggregateUDF( - name=name, - accumulator=accum, - input_type=input_type, - return_type=return_type, - state_type=state_type, - volatility=volatility, - ) +udaf = AggregateUDF.udaf diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py new file mode 100644 index 000000000..cec0be764 --- /dev/null +++ b/python/datafusion/catalog.py @@ -0,0 +1,76 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Data catalog providers.""" + +from __future__ import annotations + +import datafusion._internal as df_internal + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import pyarrow + + +class Catalog: + """DataFusion data catalog.""" + + def __init__(self, catalog: df_internal.Catalog) -> None: + """This constructor is not typically called by the end user.""" + self.catalog = catalog + + def names(self) -> list[str]: + """Returns the list of databases in this catalog.""" + return self.catalog.names() + + def database(self, name: str = "public") -> Database: + """Returns the database with the given `name` from this catalog.""" + return Database(self.catalog.database(name)) + + +class Database: + """DataFusion Database.""" + + def __init__(self, db: df_internal.Database) -> None: + """This constructor is not typically called by the end user.""" + self.db = db + + def names(self) -> set[str]: + """Returns the list of all tables in this database.""" + return self.db.names() + + def table(self, name: str) -> Table: + """Return the table with the given `name` from this database.""" + return Table(self.db.table(name)) + + +class Table: + """DataFusion table.""" + + def __init__(self, table: df_internal.Table) -> None: + """This constructor is not typically called by the end user.""" + self.table = table + + def schema(self) -> pyarrow.Schema: + """Returns the schema associated with this table.""" + return self.table.schema() + + @property + def kind(self) -> str: + """Returns the kind of table.""" + return self.table.kind() diff --git 
a/python/datafusion/common.py b/python/datafusion/common.py index dd56640a4..2351845b8 100644 --- a/python/datafusion/common.py +++ b/python/datafusion/common.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +"""Common data types used throughout the DataFusion project.""" from ._internal import common diff --git a/python/datafusion/context.py b/python/datafusion/context.py new file mode 100644 index 000000000..a717db106 --- /dev/null +++ b/python/datafusion/context.py @@ -0,0 +1,1003 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""Session Context and its associated configuration.""" + +from __future__ import annotations + +from ._internal import SessionConfig as SessionConfigInternal +from ._internal import RuntimeConfig as RuntimeConfigInternal +from ._internal import SQLOptions as SQLOptionsInternal +from ._internal import SessionContext as SessionContextInternal +from ._internal import LogicalPlan, ExecutionPlan + +from datafusion._internal import AggregateUDF +from datafusion.catalog import Catalog, Table +from datafusion.dataframe import DataFrame +from datafusion.expr import Expr +from datafusion.record_batch import RecordBatchStream +from datafusion.udf import ScalarUDF + +from typing import Any, TYPE_CHECKING +from typing_extensions import deprecated + +if TYPE_CHECKING: + import pyarrow + import pandas + import polars + import pathlib + + +class SessionConfig: + """Session configuration options.""" + + def __init__(self, config_options: dict[str, str] | None = None) -> None: + """Create a new `SessionConfig` with the given configuration options. + + Args: + config_options: Configuration options. + """ + self.config_internal = SessionConfigInternal(config_options) + + def with_create_default_catalog_and_schema( + self, enabled: bool = True + ) -> SessionConfig: + """Control if the default catalog and schema will be automatically created. + + Args: + enabled: Whether the default catalog and schema will be + automatically created. + + Returns: + A new `SessionConfig` object with the updated setting. + """ + self.config_internal = ( + self.config_internal.with_create_default_catalog_and_schema(enabled) + ) + return self + + def with_default_catalog_and_schema( + self, catalog: str, schema: str + ) -> SessionConfig: + """Select a name for the default catalog and schema. + + Args: + catalog: Catalog name. + schema: Schema name. + + Returns: + A new `SessionConfig` object with the updated setting. 
+ """ + self.config_internal = self.config_internal.with_default_catalog_and_schema( + catalog, schema + ) + return self + + def with_information_schema(self, enabled: bool = True) -> SessionConfig: + """Enable or disable the inclusion of `information_schema` virtual tables. + + Args: + enabled: Whether to include `information_schema` virtual tables. + + Returns: + A new `SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_information_schema(enabled) + return self + + def with_batch_size(self, batch_size: int) -> SessionConfig: + """Customize batch size. + + Args: + batch_size: Batch size. + + Returns: + A new `SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_batch_size(batch_size) + return self + + def with_target_partitions(self, target_partitions: int) -> SessionConfig: + """Customize the number of target partitions for query execution. + + Increasing partitions can increase concurrency. + + Args: + target_partitions: Number of target partitions. + + Returns: + A new `SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_target_partitions( + target_partitions + ) + return self + + def with_repartition_aggregations(self, enabled: bool = True) -> SessionConfig: + """Enable or disable the use of repartitioning for aggregations. + + Enabling this improves parallelism. + + Args: + enabled: Whether to use repartitioning for aggregations. + + Returns: + A new `SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_repartition_aggregations( + enabled + ) + return self + + def with_repartition_joins(self, enabled: bool = True) -> SessionConfig: + """Enable or disable the use of repartitioning for joins to improve parallelism. + + Args: + enabled: Whether to use repartitioning for joins. + + Returns: + A new `SessionConfig` object with the updated setting. 
+ """ + self.config_internal = self.config_internal.with_repartition_joins(enabled) + return self + + def with_repartition_windows(self, enabled: bool = True) -> SessionConfig: + """Enable or disable the use of repartitioning for window functions. + + This may improve parallelism. + + Args: + enabled: Whether to use repartitioning for window functions. + + Returns: + A new `SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_repartition_windows(enabled) + return self + + def with_repartition_sorts(self, enabled: bool = True) -> SessionConfig: + """Enable or disable the use of repartitioning for sorts. + + This may improve parallelism. + + Args: + enabled: Whether to use repartitioning for sorts. + + Returns: + A new `SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_repartition_sorts(enabled) + return self + + def with_repartition_file_scans(self, enabled: bool = True) -> SessionConfig: + """Enable or disable the use of repartitioning for file scans. + + Args: + enabled: Whether to use repartitioning for file scans. + + Returns: + A new `SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_repartition_file_scans(enabled) + return self + + def with_repartition_file_min_size(self, size: int) -> SessionConfig: + """Set minimum file range size for repartitioning scans. + + Args: + size: Minimum file range size. + + Returns: + A new `SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_repartition_file_min_size(size) + return self + + def with_parquet_pruning(self, enabled: bool = True) -> SessionConfig: + """Enable or disable the use of pruning predicate for parquet readers. + + Pruning predicates will enable the reader to skip row groups. + + Args: + enabled: Whether to use pruning predicate for parquet readers. 
+ + Returns: + A new `SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_parquet_pruning(enabled) + return self + + def set(self, key: str, value: str) -> SessionConfig: + """Set a configuration option. + + Args: + key: Option key. + value: Option value. + + Returns: + A new `SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.set(key, value) + return self + + +class RuntimeConfig: + """Runtime configuration options.""" + + def __init__(self) -> None: + """Create a new `RuntimeConfig` with default values.""" + self.config_internal = RuntimeConfigInternal() + + def with_disk_manager_disabled(self) -> RuntimeConfig: + """Disable the disk manager, attempts to create temporary files will error. + + Returns: + A new `RuntimeConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_disk_manager_disabled() + return self + + def with_disk_manager_os(self) -> RuntimeConfig: + """Use the operating system's temporary directory for disk manager. + + Returns: + A new `RuntimeConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_disk_manager_os() + return self + + def with_disk_manager_specified(self, *paths: str | pathlib.Path) -> RuntimeConfig: + """Use the specified paths for the disk manager's temporary files. + + Args: + paths: Paths to use for the disk manager's temporary files. + + Returns: + A new `RuntimeConfig` object with the updated setting. + """ + paths = [str(p) for p in paths] + self.config_internal = self.config_internal.with_disk_manager_specified(paths) + return self + + def with_unbounded_memory_pool(self) -> RuntimeConfig: + """Use an unbounded memory pool. + + Returns: + A new `RuntimeConfig` object with the updated setting. 
+ """ + self.config_internal = self.config_internal.with_unbounded_memory_pool() + return self + + def with_fair_spill_pool(self, size: int) -> RuntimeConfig: + """Use a fair spill pool with the specified size. + + This pool works best when you know beforehand the query has multiple spillable + operators that will likely all need to spill. Sometimes it will cause spills + even when there was sufficient memory (reserved for other operators) to avoid + doing so:: + + ┌───────────────────────z──────────────────────z───────────────┐ + │ z z │ + │ z z │ + │ Spillable z Unspillable z Free │ + │ Memory z Memory z Memory │ + │ z z │ + │ z z │ + └───────────────────────z──────────────────────z───────────────┘ + + Args: + size: Size of the memory pool in bytes. + + Returns: + A new ``RuntimeConfig`` object with the updated setting. + + Examples usage:: + + config = RuntimeConfig().with_fair_spill_pool(1024) + """ + self.config_internal = self.config_internal.with_fair_spill_pool(size) + return self + + def with_greedy_memory_pool(self, size: int) -> RuntimeConfig: + """Use a greedy memory pool with the specified size. + + This pool works well for queries that do not need to spill or have a single + spillable operator. See `RuntimeConfig.with_fair_spill_pool` if there are + multiple spillable operators that all will spill. + + Args: + size: Size of the memory pool in bytes. + + Returns: + A new `RuntimeConfig` object with the updated setting. + + Example usage:: + + config = RuntimeConfig().with_greedy_memory_pool(1024) + """ + self.config_internal = self.config_internal.with_greedy_memory_pool(size) + return self + + def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeConfig: + """Use the specified path to create any needed temporary files. + + Args: + path: Path to use for temporary files. + + Returns: + A new `RuntimeConfig` object with the updated setting. 
+ + Example usage:: + + config = RuntimeConfig().with_temp_file_path("/tmp") + """ + self.config_internal = self.config_internal.with_temp_file_path(str(path)) + return self + + +class SQLOptions: + """Options to be used when performing SQL queries on the ``SessionContext``.""" + + def __init__(self) -> None: + """Create a new `SQLOptions` with default values. + + The default values are: + - DDL commands are allowed + - DML commands are allowed + - Statements are allowed + """ + self.options_internal = SQLOptionsInternal() + + def with_allow_ddl(self, allow: bool = True) -> SQLOptions: + """Should DDL (Data Definition Language) commands be run? + + Examples of DDL commands include `CREATE TABLE` and `DROP TABLE`. + + Args: + allow: Allow DDL commands to be run. + + Returns: + A new `SQLOptions` object with the updated setting. + + Example usage:: + + options = SQLOptions().with_allow_ddl(True) + """ + self.options_internal = self.options_internal.with_allow_ddl(allow) + return self + + def with_allow_dml(self, allow: bool = True) -> SQLOptions: + """Should DML (Data Manipulation Language) commands be run? + + Examples of DML commands include `INSERT INTO` and `DELETE`. + + Args: + allow: Allow DML commands to be run. + + Returns: + A new `SQLOptions` object with the updated setting. + + Example usage:: + + options = SQLOptions().with_allow_dml(True) + """ + self.options_internal = self.options_internal.with_allow_dml(allow) + return self + + def with_allow_statements(self, allow: bool = True) -> SQLOptions: + """Should statements such as `SET VARIABLE` and `BEGIN TRANSACTION` be run? + + Args: + allow: Allow statements to be run. + + Returns: + A new `SQLOptions` object with the updated setting. 
+ + Example usage:: + + options = SQLOptions().with_allow_statements(True) + """ + self.options_internal = self.options_internal.with_allow_statements(allow) + return self + + +class SessionContext: + """This is the main interface for executing queries and creating DataFrames. + + See https://datafusion.apache.org/python/user-guide/basics.html for + additional information. + """ + + def __init__( + self, config: SessionConfig | None = None, runtime: RuntimeConfig | None = None + ) -> None: + """Main interface for executing queries with DataFusion. + + Maintains the state of the connection between a user and an instance + of the DataFusion engine. Both arguments are optional; when omitted, + ``None`` is passed through to the underlying context. + + Args: + config: Session configuration options. + runtime: Runtime configuration options. + + Example usage: + + The following example demonstrates how to use the context to execute + a query against a CSV data source using the ``DataFrame`` API:: + + from datafusion import SessionContext + + ctx = SessionContext() + df = ctx.read_csv("data.csv") + """ + config = config.config_internal if config is not None else None + runtime = runtime.config_internal if runtime is not None else None + + self.ctx = SessionContextInternal(config, runtime) + + def register_object_store(self, schema: str, store: Any, host: str | None) -> None: + """Add a new object store into the session. + + Args: + schema: The data source schema. + store: The `ObjectStore` to register. + host: URL for the host. + """ + self.ctx.register_object_store(schema, store, host) + + def register_listing_table( + self, + name: str, + path: str | pathlib.Path, + table_partition_cols: list[tuple[str, str]] | None = None, + file_extension: str = ".parquet", + schema: pyarrow.Schema | None = None, + file_sort_order: list[list[Expr]] | None = None, + ) -> None: + """Register multiple files as a single table. + + Registers a `Table` that can assemble multiple files from locations in + an `ObjectStore` instance. 
+ + Args: + name: Name of the resultant table. + path: Path to the file to register. + table_partition_cols: Partition columns. + file_extension: File extension of the provided table. + schema: The data source schema. + file_sort_order: Sort order for the file. + """ + if table_partition_cols is None: + table_partition_cols = [] + if file_sort_order is not None: + file_sort_order = [[x.expr for x in xs] for xs in file_sort_order] + self.ctx.register_listing_table( + name, + str(path), + table_partition_cols, + file_extension, + schema, + file_sort_order, + ) + + def sql(self, query: str, options: SQLOptions | None = None) -> DataFrame: + """Create a `DataFrame` from SQL query text. + + Note: This API implements DDL statements such as `CREATE TABLE` and + `CREATE VIEW` and DML statements such as `INSERT INTO` with in-memory + default implementation. See `SessionContext.sql_with_options`. + + Args: + query: SQL query text. + options: If provided, the query will be validated against these options. + + Returns: + DataFrame representation of the SQL query. + """ + if options is None: + return DataFrame(self.ctx.sql(query)) + return DataFrame(self.ctx.sql_with_options(query, options.options_internal)) + + def sql_with_options(self, query: str, options: SQLOptions) -> DataFrame: + """Create a `DataFrame` from SQL query text. + + This function will first validating that the query is allowed by the + provided options. + + Args: + query: SQL query text. + options: SQL options. + + Returns: + DataFrame representation of the SQL query. + """ + return self.sql(query, options) + + def create_dataframe( + self, + partitions: list[list[pyarrow.RecordBatch]], + name: str | None = None, + schema: pyarrow.Schema | None = None, + ) -> DataFrame: + """Create and return a dataframe using the provided partitions. + + Args: + partitions: `RecordBatch` partitions to register. + name: Resultant dataframe name. + schema: Schema for the partitions. 
+ + Returns: + DataFrame representation of the SQL query. + """ + return DataFrame(self.ctx.create_dataframe(partitions, name, schema)) + + def create_dataframe_from_logical_plan(self, plan: LogicalPlan) -> DataFrame: + """Create a `DataFrame` from an existing logical plan. + + Args: + plan: Logical plan. + + Returns: + DataFrame representation of the logical plan. + """ + return DataFrame(self.ctx.create_dataframe_from_logical_plan(plan)) + + def from_pylist( + self, data: list[dict[str, Any]], name: str | None = None + ) -> DataFrame: + """Create a `DataFrame` from a list of dictionaries. + + Args: + data: List of dictionaries. + name: Name of the DataFrame. + + Returns: + DataFrame representation of the list of dictionaries. + """ + return DataFrame(self.ctx.from_pylist(data, name)) + + def from_pydict( + self, data: dict[str, list[Any]], name: str | None = None + ) -> DataFrame: + """Create a `DataFrame` from a dictionary of lists. + + Args: + data: Dictionary of lists. + name: Name of the DataFrame. + + Returns: + DataFrame representation of the dictionary of lists. + """ + return DataFrame(self.ctx.from_pydict(data, name)) + + def from_arrow_table( + self, data: pyarrow.Table, name: str | None = None + ) -> DataFrame: + """Create a `DataFrame` from an Arrow table. + + Args: + data: Arrow table. + name: Name of the DataFrame. + + Returns: + DataFrame representation of the Arrow table. + """ + return DataFrame(self.ctx.from_arrow_table(data, name)) + + def from_pandas(self, data: pandas.DataFrame, name: str | None = None) -> DataFrame: + """Create a `DataFrame` from a Pandas DataFrame. + + Args: + data: Pandas DataFrame. + name: Name of the DataFrame. + + Returns: + DataFrame representation of the Pandas DataFrame. + """ + return DataFrame(self.ctx.from_pandas(data, name)) + + def from_polars(self, data: polars.DataFrame, name: str | None = None) -> DataFrame: + """Create a `DataFrame` from a Polars DataFrame. + + Args: + data: Polars DataFrame. 
+ name: Name of the DataFrame. + + Returns: + DataFrame representation of the Polars DataFrame. + """ + return DataFrame(self.ctx.from_polars(data, name)) + + def register_table(self, name: str, table: pyarrow.Table) -> None: + """Register a table with the given name into the session. + + Args: + name: Name of the resultant table. + table: PyArrow table to add to the session context. + """ + self.ctx.register_table(name, table) + + def deregister_table(self, name: str) -> None: + """Remove a table from the session.""" + self.ctx.deregister_table(name) + + def register_record_batches( + self, name: str, partitions: list[list[pyarrow.RecordBatch]] + ) -> None: + """Register record batches as a table. + + This function will convert the provided partitions into a table and + register it into the session using the given name. + + Args: + name: Name of the resultant table. + partitions: Record batches to register as a table. + """ + self.ctx.register_record_batches(name, partitions) + + def register_parquet( + self, + name: str, + path: str | pathlib.Path, + table_partition_cols: list[tuple[str, str]] | None = None, + parquet_pruning: bool = True, + file_extension: str = ".parquet", + skip_metadata: bool = True, + schema: pyarrow.Schema | None = None, + file_sort_order: list[list[Expr]] | None = None, + ) -> None: + """Register a Parquet file as a table. + + The registered table can be referenced from SQL statement executed + against this context. + + Args: + name: Name of the table to register. + path: Path to the Parquet file. + table_partition_cols: Partition columns. + parquet_pruning: Whether the parquet reader should use the + predicate to prune row groups. + file_extension: File extension; only files with this extension are + selected for data input. + skip_metadata: Whether the parquet reader should skip any metadata + that may be in the file schema. This can help avoid schema + conflicts due to metadata. + schema: The data source schema. 
+ file_sort_order: Sort order for the file. + """ + if table_partition_cols is None: + table_partition_cols = [] + self.ctx.register_parquet( + name, + str(path), + table_partition_cols, + parquet_pruning, + file_extension, + skip_metadata, + schema, + file_sort_order, + ) + + def register_csv( + self, + name: str, + path: str | pathlib.Path, + schema: pyarrow.Schema | None = None, + has_header: bool = True, + delimiter: str = ",", + schema_infer_max_records: int = 1000, + file_extension: str = ".csv", + file_compression_type: str | None = None, + ) -> None: + """Register a CSV file as a table. + + The registered table can be referenced from SQL statement executed against. + + Args: + name: Name of the table to register. + path: Path to the CSV file. + schema: An optional schema representing the CSV file. If None, the + CSV reader will try to infer it based on data in file. + has_header: Whether the CSV file have a header. If schema inference + is run on a file with no headers, default column names are + created. + delimiter: An optional column delimiter. + schema_infer_max_records: Maximum number of rows to read from CSV + files for schema inference if needed. + file_extension: File extension; only files with this extension are + selected for data input. + file_compression_type: File compression type. + """ + self.ctx.register_csv( + name, + str(path), + schema, + has_header, + delimiter, + schema_infer_max_records, + file_extension, + file_compression_type, + ) + + def register_json( + self, + name: str, + path: str | pathlib.Path, + schema: pyarrow.Schema | None = None, + schema_infer_max_records: int = 1000, + file_extension: str = ".json", + table_partition_cols: list[tuple[str, str]] | None = None, + file_compression_type: str | None = None, + ) -> None: + """Register a JSON file as a table. + + The registered table can be referenced from SQL statement executed + against this context. + + Args: + name: Name of the table to register. 
+ path: Path to the JSON file. + schema: The data source schema. + schema_infer_max_records: Maximum number of rows to read from JSON + files for schema inference if needed. + file_extension: File extension; only files with this extension are + selected for data input. + table_partition_cols: Partition columns. + file_compression_type: File compression type. + """ + if table_partition_cols is None: + table_partition_cols = [] + self.ctx.register_json( + name, + str(path), + schema, + schema_infer_max_records, + file_extension, + table_partition_cols, + file_compression_type, + ) + + def register_avro( + self, + name: str, + path: str | pathlib.Path, + schema: pyarrow.Schema | None = None, + file_extension: str = ".avro", + table_partition_cols: list[tuple[str, str]] | None = None, + ) -> None: + """Register an Avro file as a table. + + The registered table can be referenced from SQL statement executed against + this context. + + Args: + name: Name of the table to register. + path: Path to the Avro file. + schema: The data source schema. + file_extension: File extension to select. + table_partition_cols: Partition columns. + """ + if table_partition_cols is None: + table_partition_cols = [] + self.ctx.register_avro( + name, str(path), schema, file_extension, table_partition_cols + ) + + def register_dataset(self, name: str, dataset: pyarrow.dataset.Dataset) -> None: + """Register a `pyarrow.dataset.Dataset` as a table. + + Args: + name: Name of the table to register. + dataset: PyArrow dataset. 
+ """ + self.ctx.register_dataset(name, dataset) + + def register_udf(self, udf: ScalarUDF) -> None: + """Register a user-defined function (UDF) with the context.""" + self.ctx.register_udf(udf.udf) + + def register_udaf(self, udaf: AggregateUDF) -> None: + """Register a user-defined aggregation function (UDAF) with the context.""" + self.ctx.register_udaf(udaf) + + def catalog(self, name: str = "datafusion") -> Catalog: + """Retrieve a catalog by name.""" + return self.ctx.catalog(name) + + @deprecated( + "Use the catalog provider interface `SessionContext.catalog` to " + "examine available catalogs, schemas and tables" + ) + def tables(self) -> set[str]: + """Deprecated.""" + return self.ctx.tables() + + def table(self, name: str) -> DataFrame: + """Retrieve a `DataFrame` representing a previously registered table.""" + return DataFrame(self.ctx.table(name)) + + def table_exist(self, name: str) -> bool: + """Return whether a table with the given name exists.""" + return self.ctx.table_exist(name) + + def empty_table(self) -> DataFrame: + """Create an empty `DataFrame`.""" + return DataFrame(self.ctx.empty_table()) + + def session_id(self) -> str: + """Retrun an id that uniquely identifies this `SessionContext`.""" + return self.ctx.session_id() + + def read_json( + self, + path: str | pathlib.Path, + schema: pyarrow.Schema | None = None, + schema_infer_max_records: int = 1000, + file_extension: str = ".json", + table_partition_cols: list[tuple[str, str]] | None = None, + file_compression_type: str | None = None, + ) -> DataFrame: + """Create a `DataFrame` for reading a line-delimited JSON data source. + + Args: + path: Path to the JSON file. + schema: The data source schema. + schema_infer_max_records: Maximum number of rows to read from JSON + files for schema inference if needed. + file_extension: File extension; only files with this extension are + selected for data input. + table_partition_cols: Partition columns. 
+ file_compression_type: File compression type. + + Returns: + DataFrame representation of the read JSON files. + """ + if table_partition_cols is None: + table_partition_cols = [] + return DataFrame( + self.ctx.read_json( + str(path), + schema, + schema_infer_max_records, + file_extension, + table_partition_cols, + file_compression_type, + ) + ) + + def read_csv( + self, + path: str | pathlib.Path, + schema: pyarrow.Schema | None = None, + has_header: bool = True, + delimiter: str = ",", + schema_infer_max_records: int = 1000, + file_extension: str = ".csv", + table_partition_cols: list[tuple[str, str]] | None = None, + file_compression_type: str | None = None, + ) -> DataFrame: + """Create a `DataFrame` for reading a CSV data source. + + Args: + path: Path to the CSV file + schema: An optional schema representing the CSV files. If None, the + CSV reader will try to infer it based on data in file. + has_header: Whether the CSV file have a header. If schema inference + is run on a file with no headers, default column names are + created. + delimiter: An optional column delimiter. + schema_infer_max_records: Maximum number of rows to read from CSV + files for schema inference if needed. + file_extension: File extension; only files with this extension are + selected for data input. + table_partition_cols: Partition columns. + file_compression_type: File compression type. 
+ + Returns: + DataFrame representation of the read CSV files + """ + if table_partition_cols is None: + table_partition_cols = [] + return DataFrame( + self.ctx.read_csv( + str(path), + schema, + has_header, + delimiter, + schema_infer_max_records, + file_extension, + table_partition_cols, + file_compression_type, + ) + ) + + def read_parquet( + self, + path: str | pathlib.Path, + table_partition_cols: list[tuple[str, str]] | None = None, + parquet_pruning: bool = True, + file_extension: str = ".parquet", + skip_metadata: bool = True, + schema: pyarrow.Schema | None = None, + file_sort_order: list[list[Expr]] | None = None, + ) -> DataFrame: + """Create a `DataFrame` for reading Parquet data source. + + Args: + path: Path to the Parquet file. + table_partition_cols: Partition columns. + parquet_pruning: Whether the parquet reader should use the predicate + to prune row groups. + file_extension: File extension; only files with this extension are + selected for data input. + skip_metadata: Whether the parquet reader should skip any metadata + that may be in the file schema. This can help avoid schema + conflicts due to metadata. + schema: An optional schema representing the parquet files. If None, + the parquet reader will try to infer it based on data in the + file. + file_sort_order: Sort order for the file. + + Returns: + DataFrame representation of the read Parquet files + """ + if table_partition_cols is None: + table_partition_cols = [] + return DataFrame( + self.ctx.read_parquet( + str(path), + table_partition_cols, + parquet_pruning, + file_extension, + skip_metadata, + schema, + file_sort_order, + ) + ) + + def read_avro( + self, + path: str | pathlib.Path, + schema: pyarrow.Schema | None = None, + file_partition_cols: list[tuple[str, str]] | None = None, + file_extension: str = ".avro", + ) -> DataFrame: + """Create a ``DataFrame`` for reading Avro data source. + + Args: + path: Path to the Avro file. + schema: The data source schema. 
+ file_partition_cols: Partition columns. + file_extension: File extension to select. + + Returns: + DataFrame representation of the read Avro file + """ + if file_partition_cols is None: + file_partition_cols = [] + return DataFrame( + self.ctx.read_avro(str(path), schema, file_partition_cols, file_extension) + ) + + def read_table(self, table: Table) -> DataFrame: + """Creates a ``DataFrame`` for a ``Table`` such as a ``ListingTable``.""" + return DataFrame(self.ctx.read_table(table)) + + def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream: + """Execute the `plan` and return the results.""" + return RecordBatchStream(self.ctx.execute(plan, partitions)) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py new file mode 100644 index 000000000..68e6298f7 --- /dev/null +++ b/python/datafusion/dataframe.py @@ -0,0 +1,527 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""DataFrame is one of the core concepts in DataFusion. + +See https://datafusion.apache.org/python/user-guide/basics.html for more +information. 
class DataFrame:
    """Two dimensional table representation of data.

    See https://datafusion.apache.org/python/user-guide/basics.html for more
    information.
    """

    def __init__(self, df: DataFrameInternal) -> None:
        """This constructor is not to be used by the end user.

        See ``SessionContext`` for methods to create DataFrames.
        """
        self.df = df

    def __getitem__(self, key: str | List[str]) -> DataFrame:
        """Return a new `DataFrame` with the specified column or columns.

        Args:
            key: Column name or list of column names to select.

        Returns:
            DataFrame with the specified column or columns.
        """
        return DataFrame(self.df.__getitem__(key))

    def __repr__(self) -> str:
        """Return a string representation of the DataFrame.

        Returns:
            String representation of the DataFrame.
        """
        return self.df.__repr__()

    def describe(self) -> DataFrame:
        """Return a new `DataFrame` that has statistics for a DataFrame.

        Only numeric datatypes are summarized at the moment; nulls are
        returned for non-numeric datatypes.

        The output format is modeled after pandas.

        Returns:
            A summary DataFrame containing statistics.
        """
        return DataFrame(self.df.describe())

    def schema(self) -> pa.Schema:
        """Return the `pyarrow.Schema` describing the output of this DataFrame.

        The output schema contains information on the name, data type, and
        nullability for each column.

        Returns:
            Describing schema of the DataFrame
        """
        return self.df.schema()

    def select_columns(self, *args: str) -> DataFrame:
        """Filter the DataFrame by columns.

        Returns:
            DataFrame only containing the specified columns.
        """
        return self.select(*args)

    def select(self, *exprs: Expr | str) -> DataFrame:
        """Project arbitrary expressions into a new `DataFrame`.

        Args:
            exprs: Either column names or `Expr` to select.

        Returns:
            DataFrame after projection. It has one column for each expression.

        Example usage:

        The following example will return 3 columns from the original dataframe.
        The first two columns will be the original column `a` and `b` since the
        string "a" is assumed to refer to column selection. Also a duplicate of
        column `a` will be returned with the column name `alternate_a`::

            df = df.select("a", col("b"), col("a").alias("alternate_a"))

        """
        # Anything that is not an `Expr` is treated as a column name.
        exprs = [
            arg.expr if isinstance(arg, Expr) else Expr.column(arg).expr
            for arg in exprs
        ]
        return DataFrame(self.df.select(*exprs))

    def filter(self, *predicates: Expr) -> DataFrame:
        """Return a DataFrame for which every predicate evaluates to `True`.

        Rows for which a predicate evaluates to `False` or `None` are filtered
        out. If more than one predicate is provided, these predicates will be
        combined as a logical AND. If more complex logic is required, see the
        logical operations in `datafusion.functions`.

        Args:
            predicates: Predicate expression(s) to filter the DataFrame.

        Returns:
            DataFrame after filtering.
        """
        # Each predicate is applied as its own filter on the previous result.
        df = self.df
        for p in predicates:
            df = df.filter(p.expr)
        return DataFrame(df)

    def with_column(self, name: str, expr: Expr) -> DataFrame:
        """Add an additional column to the DataFrame.

        Args:
            name: Name of the column to add.
            expr: Expression to compute the column.

        Returns:
            DataFrame with the new column.
        """
        return DataFrame(self.df.with_column(name, expr.expr))

    def with_column_renamed(self, old_name: str, new_name: str) -> DataFrame:
        """Rename one column by applying a new projection.

        This is a no-op if the column to be renamed does not exist.

        The method supports case sensitive rename with wrapping column name
        into one the following symbols (" or ' or `).

        Args:
            old_name: Old column name.
            new_name: New column name.

        Returns:
            DataFrame with the column renamed.
        """
        return DataFrame(self.df.with_column_renamed(old_name, new_name))

    def aggregate(self, group_by: list[Expr], aggs: list[Expr]) -> DataFrame:
        """Aggregates the rows of the current DataFrame.

        Args:
            group_by: List of expressions to group by.
            aggs: List of expressions to aggregate.

        Returns:
            DataFrame after aggregation.
        """
        group_by = [e.expr for e in group_by]
        aggs = [e.expr for e in aggs]
        return DataFrame(self.df.aggregate(group_by, aggs))

    def sort(self, *exprs: Expr) -> DataFrame:
        """Sort the DataFrame by the specified sorting expressions.

        Note that any expression can be turned into a sort expression by
        calling its `sort` method.

        Args:
            exprs: Sort expressions, applied in order.

        Returns:
            DataFrame after sorting.
        """
        exprs = [expr.expr for expr in exprs]
        return DataFrame(self.df.sort(*exprs))

    def limit(self, count: int, offset: int = 0) -> DataFrame:
        """Return a new `DataFrame` with a limited number of rows.

        Args:
            count: Number of rows to limit the DataFrame to.
            offset: Number of rows to skip.

        Returns:
            DataFrame after limiting.
        """
        return DataFrame(self.df.limit(count, offset))

    def collect(self) -> list[pa.RecordBatch]:
        """Execute this `DataFrame` and collect results into memory.

        Prior to calling `collect`, modifying a DataFrame simply updates a plan
        (no actual computation is performed). Calling `collect` triggers the
        computation.

        Returns:
            List of `pyarrow.RecordBatch`es collected from the DataFrame.
        """
        return self.df.collect()

    def cache(self) -> DataFrame:
        """Cache the DataFrame as a memory table.

        Returns:
            Cached DataFrame.
        """
        return DataFrame(self.df.cache())

    def collect_partitioned(self) -> list[list[pa.RecordBatch]]:
        """Execute this DataFrame and collect all partitioned results.

        This operation returns ``RecordBatch`` maintaining the input
        partitioning.

        Returns:
            List of list of ``RecordBatch`` collected from the
                DataFrame.
        """
        return self.df.collect_partitioned()

    def show(self, num: int = 20) -> None:
        """Execute the DataFrame and print the result to the console.

        Args:
            num: Number of lines to show.
        """
        self.df.show(num)

    def distinct(self) -> DataFrame:
        """Return a new `DataFrame` with all duplicated rows removed.

        Returns:
            DataFrame after removing duplicates.
        """
        return DataFrame(self.df.distinct())

    def join(
        self,
        right: DataFrame,
        join_keys: tuple[list[str], list[str]],
        how: str,
    ) -> DataFrame:
        """Join this `DataFrame` with another `DataFrame`.

        Join keys are a pair of lists of column names in the left and right
        dataframes, respectively. These lists must have the same length.

        Args:
            right: Other DataFrame to join with.
            join_keys: Tuple of two lists of column names to join on.
            how: Type of join to perform. Supported types are "inner", "left",
                "right", "full", "semi", "anti".

        Returns:
            DataFrame after join.
        """
        return DataFrame(self.df.join(right.df, join_keys, how))

    def explain(self, verbose: bool = False, analyze: bool = False) -> DataFrame:
        """Return a DataFrame with the explanation of its plan so far.

        If `analyze` is specified, runs the plan and reports metrics.

        Args:
            verbose: If `True`, more details will be included.
            analyze: If `True`, the plan will run and metrics reported.

        Returns:
            DataFrame with the explanation of its plan.
        """
        return DataFrame(self.df.explain(verbose, analyze))

    def logical_plan(self) -> LogicalPlan:
        """Return the unoptimized `LogicalPlan` that comprises this `DataFrame`.

        Returns:
            Unoptimized logical plan.
        """
        return self.df.logical_plan()

    def optimized_logical_plan(self) -> LogicalPlan:
        """Return the optimized `LogicalPlan` that comprises this `DataFrame`.

        Returns:
            Optimized logical plan.
        """
        return self.df.optimized_logical_plan()

    def execution_plan(self) -> ExecutionPlan:
        """Return the execution/physical plan that comprises this `DataFrame`.

        Returns:
            Execution plan.
        """
        return self.df.execution_plan()

    def repartition(self, num: int) -> DataFrame:
        """Repartition a DataFrame into `num` partitions.

        The batches allocation uses a round-robin algorithm.

        Args:
            num: Number of partitions to repartition the DataFrame into.

        Returns:
            Repartitioned DataFrame.
        """
        return DataFrame(self.df.repartition(num))

    def repartition_by_hash(self, *exprs: Expr, num: int) -> DataFrame:
        """Repartition a DataFrame using a hash partitioning scheme.

        Args:
            exprs: Expressions to evaluate and perform hashing on.
            num: Number of partitions to repartition the DataFrame into.

        Returns:
            Repartitioned DataFrame.
        """
        exprs = [expr.expr for expr in exprs]
        return DataFrame(self.df.repartition_by_hash(*exprs, num=num))

    def union(self, other: DataFrame, distinct: bool = False) -> DataFrame:
        """Calculate the union of two `DataFrame`s.

        The two `DataFrame`s must have exactly the same schema.

        Args:
            other: DataFrame to union with.
            distinct: If `True`, duplicate rows will be removed.

        Returns:
            DataFrame after union.
        """
        return DataFrame(self.df.union(other.df, distinct))

    def union_distinct(self, other: DataFrame) -> DataFrame:
        """Calculate the distinct union of two `DataFrame`s.

        The two `DataFrame`s must have exactly the same schema.
        Any duplicate rows are discarded.

        Args:
            other: DataFrame to union with.

        Returns:
            DataFrame after union.
        """
        return DataFrame(self.df.union_distinct(other.df))

    def intersect(self, other: DataFrame) -> DataFrame:
        """Calculate the intersection of two `DataFrame`s.

        The two `DataFrame`s must have exactly the same schema.

        Args:
            other: DataFrame to intersect with.

        Returns:
            DataFrame after intersection.
        """
        return DataFrame(self.df.intersect(other.df))

    def except_all(self, other: DataFrame) -> DataFrame:
        """Calculate the exception of two `DataFrame`s.

        The two `DataFrame`s must have exactly the same schema.

        Args:
            other: DataFrame to calculate exception with.

        Returns:
            DataFrame after exception.
        """
        return DataFrame(self.df.except_all(other.df))

    def write_csv(self, path: str | pathlib.Path, with_header: bool = False) -> None:
        """Execute the `DataFrame` and write the results to a CSV file.

        Args:
            path: Path of the CSV file to write.
            with_header: If true, output the CSV header row.
        """
        self.df.write_csv(str(path), with_header)

    def write_parquet(
        self,
        path: str | pathlib.Path,
        compression: str = "uncompressed",
        compression_level: int | None = None,
    ) -> None:
        """Execute the `DataFrame` and write the results to a Parquet file.

        Args:
            path: Path of the Parquet file to write.
            compression: Compression type to use.
            compression_level: Compression level to use.
        """
        self.df.write_parquet(str(path), compression, compression_level)

    def write_json(self, path: str | pathlib.Path) -> None:
        """Execute the `DataFrame` and write the results to a JSON file.

        Args:
            path: Path of the JSON file to write.
        """
        self.df.write_json(str(path))

    def to_arrow_table(self) -> pa.Table:
        """Execute the `DataFrame` and convert it into an Arrow Table.

        Returns:
            Arrow Table.
        """
        return self.df.to_arrow_table()

    def execute_stream(self) -> RecordBatchStream:
        """Executes this DataFrame and returns a stream over a single partition.

        Returns:
            Record Batch Stream over a single partition.
        """
        return RecordBatchStream(self.df.execute_stream())

    def execute_stream_partitioned(self) -> list[RecordBatchStream]:
        """Executes this DataFrame and returns a stream for each partition.

        Returns:
            One record batch stream per partition.
        """
        streams = self.df.execute_stream_partitioned()
        return [RecordBatchStream(rbs) for rbs in streams]

    def to_pandas(self) -> pd.DataFrame:
        """Execute the `DataFrame` and convert it into a Pandas DataFrame.

        Returns:
            Pandas DataFrame.
        """
        return self.df.to_pandas()

    def to_pylist(self) -> list[dict[str, Any]]:
        """Execute the `DataFrame` and convert it into a list of dictionaries.

        Returns:
            List of dictionaries.
        """
        return self.df.to_pylist()

    def to_pydict(self) -> dict[str, list[Any]]:
        """Execute the `DataFrame` and convert it into a dictionary of lists.

        Returns:
            Dictionary of lists.
        """
        return self.df.to_pydict()

    def to_polars(self) -> pl.DataFrame:
        """Execute the `DataFrame` and convert it into a Polars DataFrame.

        Returns:
            Polars DataFrame.
        """
        return self.df.to_polars()

    def count(self) -> int:
        """Return the total number of rows in this `DataFrame`.

        Note that this method will actually run a plan to calculate the
        count, which may be slow for large or complicated DataFrames.

        Returns:
            Number of rows in the DataFrame.
        """
        return self.df.count()

    @deprecated("Use :func:`unnest_columns` instead.")
    def unnest_column(self, column: str, preserve_nulls: bool = True) -> DataFrame:
        """See ``unnest_columns``."""
        return DataFrame(self.df.unnest_column(column, preserve_nulls=preserve_nulls))

    def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFrame:
        """Expand columns of arrays into a single row per array element.

        Args:
            columns: Column names to perform unnest operation on.
            preserve_nulls: If False, rows with null entries will not be
                returned.

        Returns:
            A DataFrame with the columns expanded.
        """
        # The varargs tuple is handed to the internal call as a list.
        columns = list(columns)
        return DataFrame(self.df.unnest_columns(columns, preserve_nulls=preserve_nulls))
class Expr:
    """Expression object.

    Expressions are one of the core concepts in DataFusion. See
    https://datafusion.apache.org/python/user-guide/common-operations/expressions.html
    for more information.

    Each instance stores an internal expression object in ``self.expr`` and
    forwards every call to it, re-wrapping any expression that is returned.
    """

    def __init__(self, expr: expr_internal.Expr) -> None:
        """This constructor should not be called by the end user."""
        self.expr = expr

    @staticmethod
    def _coerce(value: Any) -> Expr:
        """Return ``value`` as is if it is an ``Expr``, else wrap it as a literal."""
        return value if isinstance(value, Expr) else Expr.literal(value)

    def to_variant(self) -> Any:
        """Convert this expression into a python object if possible."""
        return self.expr.to_variant()

    def display_name(self) -> str:
        """Returns the name of this expression as it should appear in a schema.

        This name will not include any CAST expressions.
        """
        return self.expr.display_name()

    def canonical_name(self) -> str:
        """Returns a complete string representation of this expression."""
        return self.expr.canonical_name()

    def variant_name(self) -> str:
        """Returns the name of the Expr variant.

        Ex: ``IsNotNull``, ``Literal``, ``BinaryExpr``, etc
        """
        return self.expr.variant_name()

    def __richcmp__(self, other: Any, op: int) -> Expr:
        """Comparison operator.

        Accepts either an expression or any valid PyArrow scalar literal value.
        """
        # Hand the wrapped object to the internal call, as every other
        # operator on this class does, rather than the wrapper itself.
        return Expr(self.expr.__richcmp__(Expr._coerce(other).expr, op))

    def __repr__(self) -> str:
        """Generate a string representation of this expression."""
        return self.expr.__repr__()

    def __add__(self, rhs: Any) -> Expr:
        """Addition operator.

        Accepts either an expression or any valid PyArrow scalar literal value.
        """
        return Expr(self.expr.__add__(Expr._coerce(rhs).expr))

    def __sub__(self, rhs: Any) -> Expr:
        """Subtraction operator.

        Accepts either an expression or any valid PyArrow scalar literal value.
        """
        return Expr(self.expr.__sub__(Expr._coerce(rhs).expr))

    def __truediv__(self, rhs: Any) -> Expr:
        """Division operator.

        Accepts either an expression or any valid PyArrow scalar literal value.
        """
        return Expr(self.expr.__truediv__(Expr._coerce(rhs).expr))

    def __mul__(self, rhs: Any) -> Expr:
        """Multiplication operator.

        Accepts either an expression or any valid PyArrow scalar literal value.
        """
        return Expr(self.expr.__mul__(Expr._coerce(rhs).expr))

    def __mod__(self, rhs: Any) -> Expr:
        """Modulo operator (%).

        Accepts either an expression or any valid PyArrow scalar literal value.
        """
        return Expr(self.expr.__mod__(Expr._coerce(rhs).expr))

    def __and__(self, rhs: Any) -> Expr:
        """Logical AND.

        Accepts either an expression or any valid PyArrow scalar literal value.
        """
        return Expr(self.expr.__and__(Expr._coerce(rhs).expr))

    def __or__(self, rhs: Any) -> Expr:
        """Logical OR.

        Accepts either an expression or any valid PyArrow scalar literal value.
        """
        return Expr(self.expr.__or__(Expr._coerce(rhs).expr))

    def __invert__(self) -> Expr:
        """Binary not (~)."""
        return Expr(self.expr.__invert__())

    def __getitem__(self, key: str) -> Expr:
        """For struct data types, return the field indicated by ``key``."""
        return Expr(self.expr.__getitem__(key))

    # NOTE: ``__eq__`` builds a new expression instead of returning a bool, and
    # defining it without ``__hash__`` makes instances unhashable.
    def __eq__(self, rhs: Any) -> Expr:
        """Equal to.

        Accepts either an expression or any valid PyArrow scalar literal value.
        """
        return Expr(self.expr.__eq__(Expr._coerce(rhs).expr))

    def __ne__(self, rhs: Any) -> Expr:
        """Not equal to.

        Accepts either an expression or any valid PyArrow scalar literal value.
        """
        return Expr(self.expr.__ne__(Expr._coerce(rhs).expr))

    def __ge__(self, rhs: Any) -> Expr:
        """Greater than or equal to.

        Accepts either an expression or any valid PyArrow scalar literal value.
        """
        return Expr(self.expr.__ge__(Expr._coerce(rhs).expr))

    def __gt__(self, rhs: Any) -> Expr:
        """Greater than.

        Accepts either an expression or any valid PyArrow scalar literal value.
        """
        return Expr(self.expr.__gt__(Expr._coerce(rhs).expr))

    def __le__(self, rhs: Any) -> Expr:
        """Less than or equal to.

        Accepts either an expression or any valid PyArrow scalar literal value.
        """
        return Expr(self.expr.__le__(Expr._coerce(rhs).expr))

    def __lt__(self, rhs: Any) -> Expr:
        """Less than.

        Accepts either an expression or any valid PyArrow scalar literal value.
        """
        return Expr(self.expr.__lt__(Expr._coerce(rhs).expr))

    @staticmethod
    def literal(value: Any) -> Expr:
        """Creates a new expression representing a scalar value.

        `value` must be a valid PyArrow scalar value or easily castable to one.
        """
        if not isinstance(value, pa.Scalar):
            value = pa.scalar(value)
        return Expr(expr_internal.Expr.literal(value))

    @staticmethod
    def column(value: str) -> Expr:
        """Creates a new expression representing a column in a ``DataFrame``."""
        return Expr(expr_internal.Expr.column(value))

    def alias(self, name: str) -> Expr:
        """Assign a name to the expression."""
        return Expr(self.expr.alias(name))

    def sort(self, ascending: bool = True, nulls_first: bool = True) -> Expr:
        """Creates a sort ``Expr`` from an existing ``Expr``.

        Args:
            ascending: If true, sort in ascending order.
            nulls_first: Return null values first.
        """
        return Expr(self.expr.sort(ascending=ascending, nulls_first=nulls_first))

    def is_null(self) -> Expr:
        """Returns an expression that tests whether this expression is null."""
        return Expr(self.expr.is_null())

    def cast(self, to: pa.DataType) -> Expr:
        """Cast to a new data type."""
        return Expr(self.expr.cast(to))

    def rex_type(self) -> RexType:
        """Return the Rex Type of this expression.

        A Rex (Row Expression) specifies a single row of data. That specification
        could include user defined functions or types. RexType identifies the
        row as one of the possible valid ``RexType``(s).
        """
        return self.expr.rex_type()

    def types(self) -> DataTypeMap:
        """Return the ``DataTypeMap``.

        Returns:
            DataTypeMap which represents the PythonType, Arrow DataType, and
            SqlType Enum which this expression represents.
        """
        return self.expr.types()

    def python_value(self) -> Any:
        """Extracts the Expr value into a PyObject.

        This is only valid for literal expressions.

        Returns:
            Python object representing literal value of the expression.
        """
        return self.expr.python_value()

    def rex_call_operands(self) -> list[Expr]:
        """Return the operands of the expression based on its variant type.

        Row expressions, Rex(s), operate on the concept of operands. Different
        variants of Expressions, Expr(s), store those operands in different
        datastructures. This function examines the Expr variant and returns
        the operands to the calling logic.
        """
        return [Expr(e) for e in self.expr.rex_call_operands()]

    def rex_call_operator(self) -> str:
        """Extracts the operator associated with a row expression type call."""
        return self.expr.rex_call_operator()

    def column_name(self, plan: LogicalPlan) -> str:
        """Compute the output column name based on the provided logical plan."""
        return self.expr.column_name(plan)
class WindowFrameBound:
    """One edge (start or end) of a window frame.

    A ``WindowFrame`` is normally described by two of these: a start bound
    and an end bound.
    """

    def __init__(self, frame_bound: expr_internal.WindowFrameBound) -> None:
        """Wrap an internal window frame bound object."""
        self.frame_bound = frame_bound

    def get_offset(self) -> int | None:
        """Return the offset reported by the wrapped bound."""
        offset = self.frame_bound.get_offset()
        return offset

    def is_current_row(self) -> bool:
        """Return whether the wrapped bound is the current row."""
        current_row = self.frame_bound.is_current_row()
        return current_row

    def is_following(self) -> bool:
        """Return whether the wrapped bound is a following bound."""
        following = self.frame_bound.is_following()
        return following

    def is_preceding(self) -> bool:
        """Return whether the wrapped bound is a preceding bound."""
        preceding = self.frame_bound.is_preceding()
        return preceding

    def is_unbounded(self) -> bool:
        """Return whether the wrapped bound is unbounded."""
        unbounded = self.frame_bound.is_unbounded()
        return unbounded
+ """ + self.case_builder = case_builder + + def when(self, when_expr: Expr, then_expr: Expr) -> CaseBuilder: + """Add a case to match against.""" + return CaseBuilder(self.case_builder.when(when_expr.expr, then_expr.expr)) + + def otherwise(self, else_expr: Expr) -> Expr: + """Set a default value for the case statement.""" + return Expr(self.case_builder.otherwise(else_expr.expr)) + + def end(self) -> Expr: + """Finish building a case statement. + + Any non-matching cases will end in a `null` value. + """ + return Expr(self.case_builder.end()) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 782ecba22..ad77712ed 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -14,10 +14,1475 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""This module contains the user functions for operating on ``Expr``.""" +from __future__ import annotations -from ._internal import functions +# from datafusion._internal.context import SessionContext +# from datafusion._internal.expr import Expr +# from datafusion._internal.expr.conditional_expr import CaseBuilder +# from datafusion._internal.expr.window import WindowFrame +from datafusion._internal import functions as f, common +from datafusion.expr import CaseBuilder, Expr, WindowFrame +from datafusion.context import SessionContext -def __getattr__(name): - return getattr(functions, name) + +def isnan(expr: Expr) -> Expr: + """Returns true if a given number is +NaN or -NaN otherwise returns false.""" + return Expr(f.isnan(expr.expr)) + + +def nullif(expr1: Expr, expr2: Expr) -> Expr: + """Returns NULL if expr1 equals expr2; otherwise it returns expr1. + + This can be used to perform the inverse operation of the COALESCE expression. 
def in_list(arg: Expr, values: list[Expr], negated: bool = False) -> Expr:
    """Test whether ``arg`` is contained in ``values``.

    Args:
        arg: Expression to look for.
        values: Candidate expressions to compare against.
        negated: Forwarded unchanged to the underlying ``in_list`` call.
    """
    candidates = [candidate.expr for candidate in values]
    membership = f.in_list(arg.expr, candidates, negated)
    return Expr(membership)
def window(
    name: str,
    args: list[Expr],
    partition_by: list[Expr] | None = None,
    order_by: list[Expr] | None = None,
    window_frame: WindowFrame | None = None,
    ctx: SessionContext | None = None,
) -> Expr:
    """Creates a new Window function expression.

    Args:
        name: Name of the window function to invoke.
        args: Argument expressions passed to the window function.
        partition_by: Optional expressions to partition by.
        order_by: Optional expressions to order by.
        window_frame: Optional frame to evaluate the function over.
        ctx: Optional session context forwarded to the underlying call.

    Returns:
        An expression wrapping the result of the underlying ``window`` call.
    """
    raw_args = [a.expr for a in args]
    raw_partition_by = (
        [e.expr for e in partition_by] if partition_by is not None else None
    )
    raw_order_by = [o.expr for o in order_by] if order_by is not None else None
    raw_frame = window_frame.window_frame if window_frame is not None else None
    # Unwrap the session context like every other wrapper argument above.
    # NOTE(review): assumes the wrapper keeps its internal context in ``.ctx``;
    # an object without that attribute is passed through untouched.
    raw_ctx = getattr(ctx, "ctx", ctx) if ctx is not None else None
    return Expr(
        f.window(name, raw_args, raw_partition_by, raw_order_by, raw_frame, raw_ctx)
    )
def acos(arg: Expr) -> Expr:
    """Returns the arc cosine or inverse cosine of a number.

    Returns:
        A new expression representing the arc cosine of the input expression.
    """
    return Expr(f.acos(arg.expr))
def exp(arg: Expr) -> Expr:
    """Returns the exponential of the argument."""
    return Expr(f.exp(arg.expr))
def lpad(string: Expr, count: Expr, characters: Expr | None = None) -> Expr:
    """Add left padding to a string.

    Extends ``string`` to length ``count`` by prepending ``characters`` (a
    space by default). If the string is already longer than ``count`` then it
    is truncated (on the right).

    Args:
        string: Expression to pad.
        count: Target length.
        characters: Padding to prepend; a single space literal is used when
            this is ``None``.
    """
    characters = characters if characters is not None else Expr.literal(" ")
    return Expr(f.lpad(string.expr, count.expr, characters.expr))
def regexp_like(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr:
    """Check whether a regular expression (regex) matches a string.

    Tests ``string`` against ``regex``, yielding true when there is at least
    one match and false otherwise. ``flags`` is optional.
    """
    raw_flags = None if flags is None else flags.expr
    matched = f.regexp_like(string.expr, regex.expr, raw_flags)
    return Expr(matched)
def round(value: Expr, decimal_places: Expr | None = None) -> Expr:
    """Round the argument to the nearest integer.

    If the optional ``decimal_places`` is specified, round to the nearest number of
    decimal places. You can specify a negative number of decimal places. For example
    `round(lit(125.2345), lit(-2))` would yield a value of `100.0`.

    Args:
        value: Expression to round.
        decimal_places: Number of decimal places to keep; a literal ``0`` is
            used when this is ``None``.
    """
    # Build the default per call instead of in the signature, where it would
    # be evaluated once at import time and shared by every call.
    if decimal_places is None:
        decimal_places = Expr.literal(0)
    return Expr(f.round(value.expr, decimal_places.expr))
+ """ + characters = characters if characters is not None else Expr.literal(" ") + return Expr(f.rpad(string.expr, count.expr, characters.expr)) + + +def rtrim(arg: Expr) -> Expr: + """Removes all characters, spaces by default, from the end of a string.""" + return Expr(f.rtrim(arg.expr)) + + +def sha224(arg: Expr) -> Expr: + """Computes the SHA-224 hash of a binary string.""" + return Expr(f.sha224(arg.expr)) + + +def sha256(arg: Expr) -> Expr: + """Computes the SHA-256 hash of a binary string.""" + return Expr(f.sha256(arg.expr)) + + +def sha384(arg: Expr) -> Expr: + """Computes the SHA-384 hash of a binary string.""" + return Expr(f.sha384(arg.expr)) + + +def sha512(arg: Expr) -> Expr: + """Computes the SHA-512 hash of a binary string.""" + return Expr(f.sha512(arg.expr)) + + +def signum(arg: Expr) -> Expr: + """Returns the sign of the argument (-1, 0, +1).""" + return Expr(f.signum(arg.expr)) + + +def sin(arg: Expr) -> Expr: + """Returns the sine of the argument.""" + return Expr(f.sin(arg.expr)) + + +def sinh(arg: Expr) -> Expr: + """Returns the hyperbolic sine of the argument.""" + return Expr(f.sinh(arg.expr)) + + +def split_part(string: Expr, delimiter: Expr, index: Expr) -> Expr: + """Split a string and return one part. + + Splits a string based on a delimiter and picks out the desired field based + on the index. 
def to_timestamp(arg: Expr, *formatters: Expr) -> Expr:
    """Converts a string and optional formats to a `Timestamp` in nanoseconds.

    For usage of ``formatters`` see the rust chrono package ``strftime`` package.

    [Documentation here.](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
    """
    # ``formatters`` is a (possibly empty) tuple and never ``None``, so no
    # special case is needed for the "no formatters" call.
    raw_formatters = [fmt.expr for fmt in formatters]
    return Expr(f.to_timestamp(arg.expr, *raw_formatters))


def to_timestamp_millis(arg: Expr, *formatters: Expr) -> Expr:
    """Converts a string and optional formats to a `Timestamp` in milliseconds.

    See `to_timestamp` for a description on how to use formatters.
    """
    raw_formatters = [fmt.expr for fmt in formatters]
    return Expr(f.to_timestamp_millis(arg.expr, *raw_formatters))


def to_timestamp_micros(arg: Expr, *formatters: Expr) -> Expr:
    """Converts a string and optional formats to a `Timestamp` in microseconds.

    See `to_timestamp` for a description on how to use formatters.
    """
    raw_formatters = [fmt.expr for fmt in formatters]
    return Expr(f.to_timestamp_micros(arg.expr, *raw_formatters))


def to_timestamp_nanos(arg: Expr, *formatters: Expr) -> Expr:
    """Converts a string and optional formats to a `Timestamp` in nanoseconds.

    See `to_timestamp` for a description on how to use formatters.
    """
    raw_formatters = [fmt.expr for fmt in formatters]
    return Expr(f.to_timestamp_nanos(arg.expr, *raw_formatters))


def to_timestamp_seconds(arg: Expr, *formatters: Expr) -> Expr:
    """Converts a string and optional formats to a `Timestamp` in seconds.

    See `to_timestamp` for a description on how to use formatters.
    """
    raw_formatters = [fmt.expr for fmt in formatters]
    return Expr(f.to_timestamp_seconds(arg.expr, *raw_formatters))


def to_unixtime(string: Expr, *format_arguments: Expr) -> Expr:
    """Converts a string and optional formats to a Unixtime."""
    # Use a loop variable that does not shadow the module alias ``f``.
    raw_formatters = [fmt.expr for fmt in format_arguments]
    return Expr(f.to_unixtime(string.expr, *raw_formatters))
+ """ + return date_part(part, date) + + +def date_part(part: Expr, date: Expr) -> Expr: + """Extracts a subfield from the date.""" + return Expr(f.date_part(part.expr, date.expr)) + + +def date_trunc(part: Expr, date: Expr) -> Expr: + """Truncates the date to a specified level of precision.""" + return Expr(f.date_trunc(part.expr, date.expr)) + + +def datetrunc(part: Expr, date: Expr) -> Expr: + """Truncates the date to a specified level of precision. + + This is an alias for `date_trunc`. + """ + return date_trunc(part, date) + + +def date_bin(stride: Expr, source: Expr, origin: Expr) -> Expr: + """Coerces an arbitrary timestamp to the start of the nearest specified interval.""" + return Expr(f.date_bin(stride.expr, source.expr, origin.expr)) + + +def make_date(year: Expr, month: Expr, day: Expr) -> Expr: + """Make a date from year, month and day component parts.""" + return Expr(f.make_date(year.expr, month.expr, day.expr)) + + +def translate(string: Expr, from_val: Expr, to_val: Expr) -> Expr: + """Replaces the characters in `from_val` with the counterpart in `to_val`.""" + return Expr(f.translate(string.expr, from_val.expr, to_val.expr)) + + +def trim(arg: Expr) -> Expr: + """Removes all characters, spaces by default, from both sides of a string.""" + return Expr(f.trim(arg.expr)) + + +def trunc(num: Expr, precision: Expr | None = None) -> Expr: + """Truncate the number toward zero with optional precision.""" + if precision is not None: + return Expr(f.trunc(num.expr, precision.expr)) + return Expr(f.trunc(num.expr)) + + +def upper(arg: Expr) -> Expr: + """Converts a string to uppercase.""" + return Expr(f.upper(arg.expr)) + + +def make_array(*args: Expr) -> Expr: + """Returns an array using the specified input expressions.""" + args = [arg.expr for arg in args] + return Expr(f.make_array(*args)) + + +def array(*args: Expr) -> Expr: + """Returns an array using the specified input expressions. + + This is an alias for `make_array`. 
+    """
+    # `make_array` is variadic, so forward the expressions one by one.
+    return make_array(*args)
+
+
+def range(start: Expr, stop: Expr, step: Expr) -> Expr:
+    """Create a list of values in the range between start and stop."""
+    return Expr(f.range(start.expr, stop.expr, step.expr))
+
+
+def uuid(arg: Expr) -> Expr:
+    """Returns uuid v4 as a string value."""
+    return Expr(f.uuid(arg.expr))
+
+
+def struct(*args: Expr) -> Expr:
+    """Returns a struct with the given arguments."""
+    args = [arg.expr for arg in args]
+    return Expr(f.struct(*args))
+
+
+def named_struct(name_pairs: list[tuple[str, Expr]]) -> Expr:
+    """Returns a struct with the given names and arguments pairs."""
+    name_pairs = [[Expr.literal(pair[0]), pair[1]] for pair in name_pairs]
+
+    # flatten
+    name_pairs = [x.expr for xs in name_pairs for x in xs]
+    return Expr(f.named_struct(*name_pairs))
+
+
+def from_unixtime(arg: Expr) -> Expr:
+    """Converts an integer to RFC3339 timestamp format string."""
+    return Expr(f.from_unixtime(arg.expr))
+
+
+def arrow_typeof(arg: Expr) -> Expr:
+    """Returns the Arrow type of the expression."""
+    return Expr(f.arrow_typeof(arg.expr))
+
+
+def random() -> Expr:
+    """Returns a random value in the range `0.0 <= x < 1.0`."""
+    return Expr(f.random())
+
+
+def array_append(array: Expr, element: Expr) -> Expr:
+    """Appends an element to the end of an array."""
+    return Expr(f.array_append(array.expr, element.expr))
+
+
+def array_push_back(array: Expr, element: Expr) -> Expr:
+    """Appends an element to the end of an array.
+
+    This is an alias for `array_append`.
+    """
+    return array_append(array, element)
+
+
+def list_append(array: Expr, element: Expr) -> Expr:
+    """Appends an element to the end of an array.
+
+    This is an alias for `array_append`.
+    """
+    return array_append(array, element)
+
+
+def list_push_back(array: Expr, element: Expr) -> Expr:
+    """Appends an element to the end of an array.
+
+    This is an alias for `array_append`.
+ """ + return array_append(array, element) + + +def array_concat(*args: Expr) -> Expr: + """Concatenates the input arrays.""" + args = [arg.expr for arg in args] + return Expr(f.array_concat(*args)) + + +def array_cat(*args: Expr) -> Expr: + """Concatenates the input arrays. + + This is an alias for `array_concat`. + """ + return array_concat(*args) + + +def array_dims(array: Expr) -> Expr: + """Returns an array of the array's dimensions.""" + return Expr(f.array_dims(array.expr)) + + +def array_distinct(array: Expr) -> Expr: + """Returns distinct values from the array after removing duplicates.""" + return Expr(f.array_distinct(array.expr)) + + +def list_distinct(array: Expr) -> Expr: + """Returns distinct values from the array after removing duplicates. + + This is an alias for `array_distinct`. + """ + return array_distinct(array) + + +def list_dims(array: Expr) -> Expr: + """Returns an array of the array's dimensions. + + This is an alias for `array_dims`. + """ + return array_dims(array) + + +def array_element(array: Expr, n: Expr) -> Expr: + """Extracts the element with the index n from the array.""" + return Expr(f.array_element(array.expr, n.expr)) + + +def array_extract(array: Expr, n: Expr) -> Expr: + """Extracts the element with the index n from the array. + + This is an alias for `array_element`. + """ + return array_element(array, n) + + +def list_element(array: Expr, n: Expr) -> Expr: + """Extracts the element with the index n from the array. + + This is an alias for `array_element`. + """ + return array_element(array, n) + + +def list_extract(array: Expr, n: Expr) -> Expr: + """Extracts the element with the index n from the array. + + This is an alias for `array_element`. + """ + return array_element(array, n) + + +def array_length(array: Expr) -> Expr: + """Returns the length of the array.""" + return Expr(f.array_length(array.expr)) + + +def list_length(array: Expr) -> Expr: + """Returns the length of the array. 
+ + This is an alias for `array_length`. + """ + return array_length(array) + + +def array_has(first_array: Expr, second_array: Expr) -> Expr: + """Returns true if the element appears in the first array, otherwise false.""" + return Expr(f.array_has(first_array.expr, second_array.expr)) + + +def array_has_all(first_array: Expr, second_array: Expr) -> Expr: + """Determines if there is complete overlap ``second_array`` in ``first_array``. + + Returns true if each element of the second array appears in the first array. + Otherwise, it returns false. + """ + return Expr(f.array_has_all(first_array.expr, second_array.expr)) + + +def array_has_any(first_array: Expr, second_array: Expr) -> Expr: + """Determine if there is an overlap between ``first_array`` and ``second_array``. + + Returns true if at least one element of the second array appears in the first + array. Otherwise, it returns false. + """ + return Expr(f.array_has_any(first_array.expr, second_array.expr)) + + +def array_position(array: Expr, element: Expr, index: int | None = 1) -> Expr: + """Return the position of the first occurrence of ``element`` in ``array``.""" + return Expr(f.array_position(array.expr, element.expr, index)) + + +def array_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr: + """Return the position of the first occurrence of ``element`` in ``array``. + + This is an alias for `array_position`. + """ + return array_position(array, element, index) + + +def list_position(array: Expr, element: Expr, index: int | None = 1) -> Expr: + """Return the position of the first occurrence of ``element`` in ``array``. + + This is an alias for `array_position`. + """ + return array_position(array, element, index) + + +def list_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr: + """Return the position of the first occurrence of ``element`` in ``array``. + + This is an alias for `array_position`. 
+ """ + return array_position(array, element, index) + + +def array_positions(array: Expr, element: Expr) -> Expr: + """Searches for an element in the array and returns all occurrences.""" + return Expr(f.array_positions(array.expr, element.expr)) + + +def list_positions(array: Expr, element: Expr) -> Expr: + """Searches for an element in the array and returns all occurrences. + + This is an alias for `array_positions`. + """ + return array_positions(array, element) + + +def array_ndims(array: Expr) -> Expr: + """Returns the number of dimensions of the array.""" + return Expr(f.array_ndims(array.expr)) + + +def list_ndims(array: Expr) -> Expr: + """Returns the number of dimensions of the array. + + This is an alias for `array_ndims`. + """ + return array_ndims(array) + + +def array_prepend(element: Expr, array: Expr) -> Expr: + """Prepends an element to the beginning of an array.""" + return Expr(f.array_prepend(element.expr, array.expr)) + + +def array_push_front(element: Expr, array: Expr) -> Expr: + """Prepends an element to the beginning of an array. + + This is an alias for `array_prepend`. + """ + return array_prepend(element, array) + + +def list_prepend(element: Expr, array: Expr) -> Expr: + """Prepends an element to the beginning of an array. + + This is an alias for `array_prepend`. + """ + return array_prepend(element, array) + + +def list_push_front(element: Expr, array: Expr) -> Expr: + """Prepends an element to the beginning of an array. + + This is an alias for `array_prepend`. 
+ """ + return array_prepend(element, array) + + +def array_pop_back(array: Expr) -> Expr: + """Returns the array without the last element.""" + return Expr(f.array_pop_back(array.expr)) + + +def array_pop_front(array: Expr) -> Expr: + """Returns the array without the first element.""" + return Expr(f.array_pop_front(array.expr)) + + +def array_remove(array: Expr, element: Expr) -> Expr: + """Removes the first element from the array equal to the given value.""" + return Expr(f.array_remove(array.expr, element.expr)) + + +def list_remove(array: Expr, element: Expr) -> Expr: + """Removes the first element from the array equal to the given value. + + This is an alias for `array_remove`. + """ + return array_remove(array, element) + + +def array_remove_n(array: Expr, element: Expr, max: Expr) -> Expr: + """Removes the first `max` elements from the array equal to the given value.""" + return Expr(f.array_remove_n(array.expr, element.expr, max.expr)) + + +def list_remove_n(array: Expr, element: Expr, max: Expr) -> Expr: + """Removes the first `max` elements from the array equal to the given value. + + This is an alias for `array_remove_n`. + """ + return array_remove_n(array, element, max) + + +def array_remove_all(array: Expr, element: Expr) -> Expr: + """Removes all elements from the array equal to the given value.""" + return Expr(f.array_remove_all(array.expr, element.expr)) + + +def list_remove_all(array: Expr, element: Expr) -> Expr: + """Removes all elements from the array equal to the given value. + + This is an alias for `array_remove_all`. 
+ """ + return array_remove_all(array, element) + + +def array_repeat(element: Expr, count: Expr) -> Expr: + """Returns an array containing `element` `count` times.""" + return Expr(f.array_repeat(element.expr, count.expr)) + + +def array_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr: + """Replaces the first occurrence of ``from_val`` with ``to_val``.""" + return Expr(f.array_replace(array.expr, from_val.expr, to_val.expr)) + + +def list_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr: + """Replaces the first occurrence of ``from_val`` with ``to_val``. + + This is an alias for `array_replace`. + """ + return array_replace(array, from_val, to_val) + + +def array_replace_n(array: Expr, from_val: Expr, to_val: Expr, max: Expr) -> Expr: + """Replace `n` occurrences of ``from_val`` with ``to_val``. + + Replaces the first `max` occurrences of the specified element with another + specified element. + """ + return Expr(f.array_replace_n(array.expr, from_val.expr, to_val.expr, max.expr)) + + +def list_replace_n(array: Expr, from_val: Expr, to_val: Expr, max: Expr) -> Expr: + """Replace `n` occurrences of ``from_val`` with ``to_val``. + + Replaces the first `max` occurrences of the specified element with another + specified element. + + This is an alias for `array_replace_n`. + """ + return array_replace_n(array, from_val, to_val, max) + + +def array_replace_all(array: Expr, from_val: Expr, to_val: Expr) -> Expr: + """Replaces all occurrences of ``from_val`` with ``to_val``.""" + return Expr(f.array_replace_all(array.expr, from_val.expr, to_val.expr)) + + +def list_replace_all(array: Expr, from_val: Expr, to_val: Expr) -> Expr: + """Replaces all occurrences of ``from_val`` with ``to_val``. + + This is an alias for `array_replace_all`. 
+ """ + return array_replace_all(array, from_val, to_val) + + +def array_slice( + array: Expr, begin: Expr, end: Expr, stride: Expr | None = None +) -> Expr: + """Returns a slice of the array.""" + if stride is not None: + stride = stride.expr + return Expr(f.array_slice(array.expr, begin.expr, end.expr, stride)) + + +def list_slice(array: Expr, begin: Expr, end: Expr, stride: Expr | None = None) -> Expr: + """Returns a slice of the array. + + This is an alias for `array_slice`. + """ + return array_slice(array, begin, end, stride) + + +def array_intersect(array1: Expr, array2: Expr) -> Expr: + """Returns an array of the elements in the intersection of array1 and array2.""" + return Expr(f.array_intersect(array1.expr, array2.expr)) + + +def list_intersect(array1: Expr, array2: Expr) -> Expr: + """Returns an array of the elements in the intersection of `array1` and `array2`. + + This is an alias for `array_intersect`. + """ + return array_intersect(array1, array2) + + +def array_union(array1: Expr, array2: Expr) -> Expr: + """Returns an array of the elements in the union of array1 and array2. + + Duplicate rows will not be returned. + """ + return Expr(f.array_union(array1.expr, array2.expr)) + + +def list_union(array1: Expr, array2: Expr) -> Expr: + """Returns an array of the elements in the union of array1 and array2. + + Duplicate rows will not be returned. + + This is an alias for `array_union`. + """ + return array_union(array1, array2) + + +def array_except(array1: Expr, array2: Expr) -> Expr: + """Returns an array of the elements that appear in `array1` but not in `array2`.""" + return Expr(f.array_except(array1.expr, array2.expr)) + + +def list_except(array1: Expr, array2: Expr) -> Expr: + """Returns an array of the elements that appear in `array1` but not in the `array2`. + + This is an alias for `array_except`. 
+    """
+    return array_except(array1, array2)
+
+
+def array_resize(array: Expr, size: Expr, value: Expr) -> Expr:
+    """Returns an array with the specified size filled.
+
+    If `size` is greater than the `array` length, the additional entries will be filled
+    with the given `value`.
+    """
+    return Expr(f.array_resize(array.expr, size.expr, value.expr))
+
+
+def list_resize(array: Expr, size: Expr, value: Expr) -> Expr:
+    """Returns an array with the specified size filled.
+
+    If `size` is greater than the `array` length, the additional entries will be
+    filled with the given `value`. This is an alias for `array_resize`.
+    """
+    return array_resize(array, size, value)
+
+
+def flatten(array: Expr) -> Expr:
+    """Flattens an array of arrays into a single array."""
+    return Expr(f.flatten(array.expr))
+
+
+# aggregate functions
+def approx_distinct(arg: Expr) -> Expr:
+    """Returns the approximate number of distinct values."""
+    return Expr(f.approx_distinct(arg.expr, distinct=True))
+
+
+def approx_median(arg: Expr, distinct: bool = False) -> Expr:
+    """Returns the approximate median value."""
+    return Expr(f.approx_median(arg.expr, distinct=distinct))
+
+
+def approx_percentile_cont(
+    expr: Expr,
+    percentile: Expr,
+    num_centroids: int | None = None,
+    distinct: bool = False,
+) -> Expr:
+    """Returns the value that is approximately at a given percentile of ``expr``."""
+    if num_centroids is None:
+        return Expr(
+            f.approx_percentile_cont(expr.expr, percentile.expr, distinct=distinct)
+        )
+
+    return Expr(
+        f.approx_percentile_cont(
+            expr.expr, percentile.expr, num_centroids, distinct=distinct
+        )
+    )
+
+
+def approx_percentile_cont_with_weight(
+    arg: Expr, weight: Expr, percentile: Expr, distinct: bool = False
+) -> Expr:
+    """Returns the value of the approximate percentile.
+
+    This function is similar to ``approx_percentile_cont`` except that it uses
+    the associated weights.
+    """
+    return Expr(
+        f.approx_percentile_cont_with_weight(
+            arg.expr, weight.expr, percentile.expr, distinct=distinct
+        )
+    )
+
+
+def array_agg(arg: Expr, distinct: bool = False) -> Expr:
+    """Aggregate values into an array."""
+    return Expr(f.array_agg(arg.expr, distinct=distinct))
+
+
+def avg(arg: Expr, distinct: bool = False) -> Expr:
+    """Returns the average value."""
+    return Expr(f.avg(arg.expr, distinct=distinct))
+
+
+def corr(value1: Expr, value2: Expr, distinct: bool = False) -> Expr:
+    """Returns the correlation coefficient between `value1` and `value2`."""
+    return Expr(f.corr(value1.expr, value2.expr, distinct=distinct))
+
+
+def count(args: Expr | list[Expr] | None = None, distinct: bool = False) -> Expr:
+    """Returns the number of rows that match the given arguments.
+
+    Args:
+        args: A single expression, a list of expressions, or ``None``. When
+            ``None`` a literal ``1`` is counted instead of unpacking ``None``.
+        distinct: Forwarded to the internal ``count`` function.
+    """
+    if args is None:
+        # The default used to reach `f.count(*None)` and raise a TypeError.
+        args = [Expr.literal(1).expr]
+    elif isinstance(args, Expr):
+        args = [args.expr]
+    else:
+        args = [arg.expr for arg in args]
+    return Expr(f.count(*args, distinct=distinct))
+
+
+def covar(y: Expr, x: Expr) -> Expr:
+    """Computes the sample covariance.
+
+    This is an alias for `covar_samp`.
+    """
+    return Expr(f.covar(y.expr, x.expr))
+
+
+def covar_pop(y: Expr, x: Expr) -> Expr:
+    """Computes the population covariance."""
+    return Expr(f.covar_pop(y.expr, x.expr))
+
+
+def covar_samp(y: Expr, x: Expr) -> Expr:
+    """Computes the sample covariance."""
+    return Expr(f.covar_samp(y.expr, x.expr))
+
+
+def grouping(arg: Expr, distinct: bool = False) -> Expr:
+    """Indicates if the expression is aggregated or not.
+
+    Returns 1 if the value of the argument is aggregated, 0 if not.
+    """
+    return Expr(f.grouping([arg.expr], distinct=distinct))
+
+
+def max(arg: Expr, distinct: bool = False) -> Expr:
+    """Returns the maximum value of the argument."""
+    return Expr(f.max(arg.expr, distinct=distinct))
+
+
+def mean(arg: Expr, distinct: bool = False) -> Expr:
+    """Returns the average (mean) value of the argument.
+
+    This is an alias for `avg`.
+    """
+    return avg(arg, distinct)
+
+
+def median(arg: Expr) -> Expr:
+    """Computes the median of a set of numbers."""
+    return Expr(f.median(arg.expr))
+
+
+def min(arg: Expr, distinct: bool = False) -> Expr:
+    """Returns the minimum value of the argument."""
+    return Expr(f.min(arg.expr, distinct=distinct))
+
+
+def sum(arg: Expr) -> Expr:
+    """Computes the sum of a set of numbers."""
+    return Expr(f.sum(arg.expr))
+
+
+def stddev(arg: Expr, distinct: bool = False) -> Expr:
+    """Computes the standard deviation of the argument."""
+    return Expr(f.stddev(arg.expr, distinct=distinct))
+
+
+def stddev_pop(arg: Expr, distinct: bool = False) -> Expr:
+    """Computes the population standard deviation of the argument."""
+    return Expr(f.stddev_pop(arg.expr, distinct=distinct))
+
+
+def stddev_samp(arg: Expr, distinct: bool = False) -> Expr:
+    """Computes the sample standard deviation of the argument.
+
+    This is an alias for `stddev`.
+    """
+    return stddev(arg, distinct)
+
+
+def var(arg: Expr) -> Expr:
+    """Computes the sample variance of the argument.
+
+    This is an alias for `var_samp`.
+    """
+    return var_samp(arg)
+
+
+def var_pop(arg: Expr, distinct: bool = False) -> Expr:
+    """Computes the population variance of the argument."""
+    return Expr(f.var_pop(arg.expr, distinct=distinct))
+
+
+def var_samp(arg: Expr) -> Expr:
+    """Computes the sample variance of the argument."""
+    return Expr(f.var_samp(arg.expr))
+
+
+def regr_avgx(y: Expr, x: Expr, distinct: bool = False) -> Expr:
+    """Computes the average of the independent variable `x`.
+
+    Only non-null pairs of the inputs are evaluated.
+    """
+    # Call the binding (it was subscripted before) and hand `distinct` to it
+    # by keyword, the same way `corr` does, instead of to the `Expr` wrapper.
+    return Expr(f.regr_avgx(y.expr, x.expr, distinct=distinct))
+
+
+def regr_avgy(y: Expr, x: Expr, distinct: bool = False) -> Expr:
+    """Computes the average of the dependent variable ``y``.
+
+    Only non-null pairs of the inputs are evaluated.
+    """
+    # Call the binding (it was subscripted before) and pass `distinct` to it
+    # by keyword rather than to the `Expr` wrapper.
+    return Expr(f.regr_avgy(y.expr, x.expr, distinct=distinct))
+
+
+def regr_count(y: Expr, x: Expr, distinct: bool = False) -> Expr:
+    """Counts the number of rows in which both expressions are not null."""
+    return Expr(f.regr_count(y.expr, x.expr, distinct=distinct))
+
+
+def regr_intercept(y: Expr, x: Expr, distinct: bool = False) -> Expr:
+    """Computes the intercept from the linear regression."""
+    return Expr(f.regr_intercept(y.expr, x.expr, distinct=distinct))
+
+
+def regr_r2(y: Expr, x: Expr, distinct: bool = False) -> Expr:
+    """Computes the R-squared value from linear regression."""
+    return Expr(f.regr_r2(y.expr, x.expr, distinct=distinct))
+
+
+def regr_slope(y: Expr, x: Expr, distinct: bool = False) -> Expr:
+    """Computes the slope from linear regression."""
+    return Expr(f.regr_slope(y.expr, x.expr, distinct=distinct))
+
+
+def regr_sxx(y: Expr, x: Expr, distinct: bool = False) -> Expr:
+    """Computes the sum of squares of the independent variable `x`."""
+    return Expr(f.regr_sxx(y.expr, x.expr, distinct=distinct))
+
+
+def regr_sxy(y: Expr, x: Expr, distinct: bool = False) -> Expr:
+    """Computes the sum of products of pairs of numbers."""
+    return Expr(f.regr_sxy(y.expr, x.expr, distinct=distinct))
+
+
+def regr_syy(y: Expr, x: Expr, distinct: bool = False) -> Expr:
+    """Computes the sum of squares of the dependent variable `y`."""
+    return Expr(f.regr_syy(y.expr, x.expr, distinct=distinct))
+
+
+def first_value(
+    arg: Expr,
+    distinct: bool = False,
+    filter: bool = None,
+    order_by: Expr | None = None,
+    null_treatment: common.NullTreatment | None = None,
+) -> Expr:
+    """Returns the first value in a group of values."""
+    return Expr(
+        f.first_value(
+            arg.expr,
+            distinct=distinct,
+            filter=filter,
+            order_by=order_by,
+            null_treatment=null_treatment,
+        )
+    )
+
+
+def last_value(
+    arg: Expr,
+    distinct: bool = False,
+    filter: bool = None,
+    order_by: Expr | None = None,
+    null_treatment: common.NullTreatment | None = None,
+) -> Expr:
+    """Returns the last value in a group of values."""
+    return Expr(
+        f.last_value(
+            arg.expr,
+            distinct=distinct,
+            filter=filter,
+            order_by=order_by,
+            null_treatment=null_treatment,
+        )
+    )
+
+
+def bit_and(*args: Expr, distinct: bool = False) -> Expr:
+    """Computes the bitwise AND of the argument."""
+    args = [arg.expr for arg in args]
+    return Expr(f.bit_and(*args, distinct=distinct))
+
+
+def bit_or(*args: Expr, distinct: bool = False) -> Expr:
+    """Computes the bitwise OR of the argument."""
+    args = [arg.expr for arg in args]
+    return Expr(f.bit_or(*args, distinct=distinct))
+
+
+def bit_xor(*args: Expr, distinct: bool = False) -> Expr:
+    """Computes the bitwise XOR of the argument."""
+    args = [arg.expr for arg in args]
+    return Expr(f.bit_xor(*args, distinct=distinct))
+
+
+def bool_and(*args: Expr, distinct: bool = False) -> Expr:
+    """Computes the boolean AND of the argument."""
+    args = [arg.expr for arg in args]
+    return Expr(f.bool_and(*args, distinct=distinct))
+
+
+def bool_or(*args: Expr, distinct: bool = False) -> Expr:
+    """Computes the boolean OR of the argument."""
+    args = [arg.expr for arg in args]
+    return Expr(f.bool_or(*args, distinct=distinct))
diff --git a/python/datafusion/input/__init__.py b/python/datafusion/input/__init__.py
index 27e39b8ca..f85ce21f0 100644
--- a/python/datafusion/input/__init__.py
+++ b/python/datafusion/input/__init__.py
@@ -15,6 +15,11 @@
 # specific language governing permissions and limitations
 # under the License.
 
+"""This package provides for input sources.
+
+The primary class used within DataFusion is ``LocationInputPlugin``.
+"""
+
 from .location import LocationInputPlugin
 
 __all__ = [
diff --git a/python/datafusion/input/base.py b/python/datafusion/input/base.py
index efcaf7697..4eba19784 100644
--- a/python/datafusion/input/base.py
+++ b/python/datafusion/input/base.py
@@ -15,6 +15,11 @@
 # specific language governing permissions and limitations
 # under the License.
 
+"""This module provides ``BaseInputSource``.
+
+A user can extend this to provide a custom input source.
+""" + from abc import ABC, abstractmethod from typing import Any @@ -22,18 +27,22 @@ class BaseInputSource(ABC): - """ - If a consuming library would like to provider their own InputSource - this is the class they should extend to write their own. Once - completed the Plugin InputSource can be registered with the + """Base Input Source class. + + If a consuming library would like to provider their own InputSource this is + the class they should extend to write their own. + + Once completed the Plugin InputSource can be registered with the SessionContext to ensure that it will be used in order to obtain the SqlTable information from the custom datasource. """ @abstractmethod def is_correct_input(self, input_item: Any, table_name: str, **kwargs) -> bool: + """Returns `True` if the input is valid.""" pass @abstractmethod def build_table(self, input_item: Any, table_name: str, **kwarg) -> SqlTable: + """Create a table from the input source.""" pass diff --git a/python/datafusion/input/location.py b/python/datafusion/input/location.py index 16e632d1b..566a63da9 100644 --- a/python/datafusion/input/location.py +++ b/python/datafusion/input/location.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +"""The default input source for DataFusion.""" + import os import glob from typing import Any @@ -24,12 +26,13 @@ class LocationInputPlugin(BaseInputSource): - """ - Input Plugin for everything, which can be read - in from a file (on disk, remote etc.) + """Input Plugin for everything. + + This can be read in from a file (on disk, remote etc.). 
""" def is_correct_input(self, input_item: Any, table_name: str, **kwargs): + """Returns `True` if the input is valid.""" return isinstance(input_item, str) def build_table( @@ -38,6 +41,7 @@ def build_table( table_name: str, **kwargs, ) -> SqlTable: + """Create a table from the input source.""" _, extension = os.path.splitext(input_file) format = extension.lstrip(".").lower() num_rows = 0 # Total number of rows in the file. Used for statistics diff --git a/python/datafusion/object_store.py b/python/datafusion/object_store.py index 70ecbd2bb..a9bb83d29 100644 --- a/python/datafusion/object_store.py +++ b/python/datafusion/object_store.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +"""Object store functionality.""" from ._internal import object_store diff --git a/python/datafusion/py.typed b/python/datafusion/py.typed new file mode 100644 index 000000000..d216be4dd --- /dev/null +++ b/python/datafusion/py.typed @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
\ No newline at end of file diff --git a/python/datafusion/record_batch.py b/python/datafusion/record_batch.py new file mode 100644 index 000000000..dcfd55485 --- /dev/null +++ b/python/datafusion/record_batch.py @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""This module provides the classes for handling record batches. + +These are typically the result of dataframe `execute_stream` operations. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import pyarrow + import datafusion._internal as df_internal + import typing_extensions + + +class RecordBatch: + """This class is essentially a wrapper for ``pyarrow.RecordBatch``.""" + + def __init__(self, record_batch: df_internal.RecordBatch) -> None: + """This constructor is generally not called by the end user. + + See the ``RecordBatchStream`` iterator for generating this class. + """ + self.record_batch = record_batch + + def to_pyarrow(self) -> pyarrow.RecordBatch: + """Convert to pyarrow ``RecordBatch``.""" + return self.record_batch.to_pyarrow() + + +class RecordBatchStream: + """This class represents a stream of record batches. + + These are typically the result of a ``DataFrame::execute_stream`` operation. 
+ """ + + def __init__(self, record_batch_stream: df_internal.RecordBatchStream) -> None: + """This constructor is typically not called by the end user.""" + self.rbs = record_batch_stream + + def next(self) -> RecordBatch | None: + """See ``__next__`` for the iterator function.""" + try: + next_batch = next(self) + except StopIteration: + return None + + return next_batch + + def __next__(self) -> RecordBatch: + """Iterator function.""" + next_batch = next(self.rbs) + return RecordBatch(next_batch) + + def __iter__(self) -> typing_extensions.Self: + """Iterator function.""" + return self diff --git a/python/datafusion/substrait.py b/python/datafusion/substrait.py index eff809a0c..a199dd733 100644 --- a/python/datafusion/substrait.py +++ b/python/datafusion/substrait.py @@ -15,9 +15,171 @@ # specific language governing permissions and limitations # under the License. +"""This module provides support for using substrait with datafusion. -from ._internal import substrait +For additional information about substrait, see https://substrait.io/ for more +information about substrait. +""" +from __future__ import annotations -def __getattr__(name): - return getattr(substrait, name) +from ._internal import substrait as substrait_internal + +from typing import TYPE_CHECKING +from typing_extensions import deprecated +import pathlib + +if TYPE_CHECKING: + from datafusion.context import SessionContext + from datafusion._internal import LogicalPlan + + +class Plan: + """A class representing an encodable substrait plan.""" + + def __init__(self, plan: substrait_internal.Plan) -> None: + """Create a substrait plan. + + The user should not have to call this constructor directly. Rather, it + should be created via `Serde` or `Producer` classes in this module. + """ + self.plan_internal = plan + + def encode(self) -> bytes: + """Encode the plan to bytes. + + Returns: + Encoded plan. 
+        """
+        return self.plan_internal.encode()
+
+
+@deprecated("Use `Plan` instead.")
+class plan(Plan):
+    """See `Plan`."""
+
+    pass
+
+
+class Serde:
+    """Provides the ``Substrait`` serialization and deserialization."""
+
+    @staticmethod
+    def serialize(sql: str, ctx: SessionContext, path: str | pathlib.Path) -> None:
+        """Serialize a SQL query to a Substrait plan and write it to a file.
+
+        Args:
+            sql: SQL query to serialize.
+            ctx: SessionContext to use.
+            path: Path to write the Substrait plan to.
+        """
+        return substrait_internal.serde.serialize(sql, ctx.ctx, str(path))
+
+    @staticmethod
+    def serialize_to_plan(sql: str, ctx: SessionContext) -> Plan:
+        """Serialize a SQL query to a Substrait plan.
+
+        Args:
+            sql: SQL query to serialize.
+            ctx: SessionContext to use.
+
+        Returns:
+            Substrait plan.
+        """
+        return Plan(substrait_internal.serde.serialize_to_plan(sql, ctx.ctx))
+
+    @staticmethod
+    def serialize_bytes(sql: str, ctx: SessionContext) -> bytes:
+        """Serialize a SQL query to a Substrait plan as bytes.
+
+        Args:
+            sql: SQL query to serialize.
+            ctx: SessionContext to use.
+
+        Returns:
+            Substrait plan as bytes.
+        """
+        return substrait_internal.serde.serialize_bytes(sql, ctx.ctx)
+
+    @staticmethod
+    def deserialize(path: str | pathlib.Path) -> Plan:
+        """Deserialize a Substrait plan from a file.
+
+        Args:
+            path: Path to read the Substrait plan from.
+
+        Returns:
+            Substrait plan.
+        """
+        return Plan(substrait_internal.serde.deserialize(str(path)))
+
+    @staticmethod
+    def deserialize_bytes(proto_bytes: bytes) -> Plan:
+        """Deserialize a Substrait plan from bytes.
+
+        Args:
+            proto_bytes: Bytes to read the Substrait plan from.
+
+        Returns:
+            Substrait plan.
+ """ + return Plan(substrait_internal.serde.deserialize_bytes(proto_bytes)) + + +@deprecated("Use `Serde` instead.") +class serde(Serde): + """See `Serde` instead.""" + + pass + + +class Producer: + """Generates substrait plans from a logical plan.""" + + @staticmethod + def to_substrait_plan(logical_plan: LogicalPlan, ctx: SessionContext) -> Plan: + """Convert a DataFusion LogicalPlan to a Substrait plan. + + Args: + logical_plan: LogicalPlan to convert. + ctx: SessionContext to use. + + Returns: + Substrait plan. + """ + return Plan( + substrait_internal.producer.to_substrait_plan(logical_plan, ctx.ctx) + ) + + +@deprecated("Use `Producer` instead.") +class producer(Producer): + """Use `Producer` instead.""" + + pass + + +class Consumer: + """Generates a logical plan from a substrait plan.""" + + @staticmethod + def from_substrait_plan(ctx: SessionContext, plan: Plan) -> LogicalPlan: + """Convert a Substrait plan to a DataFusion LogicalPlan. + + Args: + ctx: SessionContext to use. + plan: Substrait plan to convert. + + Returns: + LogicalPlan. 
+ """ + return substrait_internal.consumer.from_substrait_plan( + ctx.ctx, plan.plan_internal + ) + + +@deprecated("Use `Consumer` instead.") +class consumer(Consumer): + """Use `Consumer` instead.""" + + pass diff --git a/python/datafusion/tests/conftest.py b/python/datafusion/tests/conftest.py index a4eec41e2..1cc07e500 100644 --- a/python/datafusion/tests/conftest.py +++ b/python/datafusion/tests/conftest.py @@ -18,6 +18,7 @@ import pytest from datafusion import SessionContext import pyarrow as pa +from pyarrow.csv import write_csv @pytest.fixture @@ -37,7 +38,7 @@ def database(ctx, tmp_path): ], names=["int", "str", "float"], ) - pa.csv.write_csv(table, path) + write_csv(table, path) ctx.register_csv("csv", path) ctx.register_csv("csv1", str(path)) diff --git a/python/datafusion/tests/test_context.py b/python/datafusion/tests/test_context.py index abc324db8..8373659b0 100644 --- a/python/datafusion/tests/test_context.py +++ b/python/datafusion/tests/test_context.py @@ -17,6 +17,7 @@ import gzip import os import datetime as dt +import pathlib import pyarrow as pa import pyarrow.dataset as ds @@ -37,6 +38,36 @@ def test_create_context_no_args(): SessionContext() +@pytest.mark.parametrize("path_to_str", (True, False)) +def test_runtime_configs(tmp_path, path_to_str): + path1 = tmp_path / "dir1" + path2 = tmp_path / "dir2" + + path1 = str(path1) if path_to_str else path1 + path2 = str(path2) if path_to_str else path2 + + runtime = RuntimeConfig().with_disk_manager_specified(path1, path2) + config = SessionConfig().with_default_catalog_and_schema("foo", "bar") + ctx = SessionContext(config, runtime) + assert ctx is not None + + db = ctx.catalog("foo").database("bar") + assert db is not None + + +@pytest.mark.parametrize("path_to_str", (True, False)) +def test_temporary_files(tmp_path, path_to_str): + path = str(tmp_path) if path_to_str else tmp_path + + runtime = RuntimeConfig().with_temp_file_path(path) + config = 
SessionConfig().with_default_catalog_and_schema("foo", "bar") + ctx = SessionContext(config, runtime) + assert ctx is not None + + db = ctx.catalog("foo").database("bar") + assert db is not None + + def test_create_context_with_all_valid_args(): runtime = RuntimeConfig().with_disk_manager_os().with_fair_spill_pool(10000000) config = ( @@ -68,7 +99,7 @@ def test_register_record_batches(ctx): ctx.register_record_batches("t", [[batch]]) - assert ctx.tables() == {"t"} + assert ctx.catalog().database().names() == {"t"} result = ctx.sql("SELECT a+b, a-b FROM t").collect() @@ -84,7 +115,7 @@ def test_create_dataframe_registers_unique_table_name(ctx): ) df = ctx.create_dataframe([[batch]]) - tables = list(ctx.tables()) + tables = list(ctx.catalog().database().names()) assert df assert len(tables) == 1 @@ -104,7 +135,7 @@ def test_create_dataframe_registers_with_defined_table_name(ctx): ) df = ctx.create_dataframe([[batch]], name="tbl") - tables = list(ctx.tables()) + tables = list(ctx.catalog().database().names()) assert df assert len(tables) == 1 @@ -118,11 +149,11 @@ def test_from_arrow_table(ctx): # convert to DataFrame df = ctx.from_arrow_table(table) - tables = list(ctx.tables()) + tables = list(ctx.catalog().database().names()) assert df assert len(tables) == 1 - assert type(df) == DataFrame + assert isinstance(df, DataFrame) assert set(df.schema().names) == {"a", "b"} assert df.collect()[0].num_rows == 3 @@ -134,7 +165,7 @@ def test_from_arrow_table_with_name(ctx): # convert to DataFrame with optional name df = ctx.from_arrow_table(table, name="tbl") - tables = list(ctx.tables()) + tables = list(ctx.catalog().database().names()) assert df assert tables[0] == "tbl" @@ -147,7 +178,7 @@ def test_from_arrow_table_empty(ctx): # convert to DataFrame df = ctx.from_arrow_table(table) - tables = list(ctx.tables()) + tables = list(ctx.catalog().database().names()) assert df assert len(tables) == 1 @@ -162,7 +193,7 @@ def test_from_arrow_table_empty_no_schema(ctx): # convert 
to DataFrame df = ctx.from_arrow_table(table) - tables = list(ctx.tables()) + tables = list(ctx.catalog().database().names()) assert df assert len(tables) == 1 @@ -180,11 +211,11 @@ def test_from_pylist(ctx): ] df = ctx.from_pylist(data) - tables = list(ctx.tables()) + tables = list(ctx.catalog().database().names()) assert df assert len(tables) == 1 - assert type(df) == DataFrame + assert isinstance(df, DataFrame) assert set(df.schema().names) == {"a", "b"} assert df.collect()[0].num_rows == 3 @@ -194,11 +225,11 @@ def test_from_pydict(ctx): data = {"a": [1, 2, 3], "b": [4, 5, 6]} df = ctx.from_pydict(data) - tables = list(ctx.tables()) + tables = list(ctx.catalog().database().names()) assert df assert len(tables) == 1 - assert type(df) == DataFrame + assert isinstance(df, DataFrame) assert set(df.schema().names) == {"a", "b"} assert df.collect()[0].num_rows == 3 @@ -210,11 +241,11 @@ def test_from_pandas(ctx): pandas_df = pd.DataFrame(data) df = ctx.from_pandas(pandas_df) - tables = list(ctx.tables()) + tables = list(ctx.catalog().database().names()) assert df assert len(tables) == 1 - assert type(df) == DataFrame + assert isinstance(df, DataFrame) assert set(df.schema().names) == {"a", "b"} assert df.collect()[0].num_rows == 3 @@ -226,11 +257,11 @@ def test_from_polars(ctx): polars_df = pd.DataFrame(data) df = ctx.from_polars(polars_df) - tables = list(ctx.tables()) + tables = list(ctx.catalog().database().names()) assert df assert len(tables) == 1 - assert type(df) == DataFrame + assert isinstance(df, DataFrame) assert set(df.schema().names) == {"a", "b"} assert df.collect()[0].num_rows == 3 @@ -273,7 +304,7 @@ def test_register_dataset(ctx): dataset = ds.dataset([batch]) ctx.register_dataset("t", dataset) - assert ctx.tables() == {"t"} + assert ctx.catalog().database().names() == {"t"} result = ctx.sql("SELECT a+b, a-b FROM t").collect() @@ -290,7 +321,7 @@ def test_dataset_filter(ctx, capfd): dataset = ds.dataset([batch]) ctx.register_dataset("t", dataset) - 
assert ctx.tables() == {"t"} + assert ctx.catalog().database().names() == {"t"} df = ctx.sql("SELECT a+b, a-b FROM t WHERE a BETWEEN 2 and 3 AND b > 5") # Make sure the filter was pushed down in Physical Plan @@ -370,7 +401,7 @@ def test_dataset_filter_nested_data(ctx): dataset = ds.dataset([batch]) ctx.register_dataset("t", dataset) - assert ctx.tables() == {"t"} + assert ctx.catalog().database().names() == {"t"} df = ctx.table("t") @@ -468,13 +499,23 @@ def test_read_csv_compressed(ctx, tmp_path): def test_read_parquet(ctx): - csv_df = ctx.read_parquet(path="parquet/data/alltypes_plain.parquet") - csv_df.show() + parquet_df = ctx.read_parquet(path="parquet/data/alltypes_plain.parquet") + parquet_df.show() + assert parquet_df is not None + + path = pathlib.Path.cwd() / "parquet/data/alltypes_plain.parquet" + parquet_df = ctx.read_parquet(path=path) + assert parquet_df is not None def test_read_avro(ctx): - csv_df = ctx.read_avro(path="testing/data/avro/alltypes_plain.avro") - csv_df.show() + avro_df = ctx.read_avro(path="testing/data/avro/alltypes_plain.avro") + avro_df.show() + assert avro_df is not None + + path = pathlib.Path.cwd() / "testing/data/avro/alltypes_plain.avro" + avro_df = ctx.read_avro(path=path) + assert avro_df is not None def test_create_sql_options(): diff --git a/python/datafusion/tests/test_dataframe.py b/python/datafusion/tests/test_dataframe.py index 2f6a818ea..25875da77 100644 --- a/python/datafusion/tests/test_dataframe.py +++ b/python/datafusion/tests/test_dataframe.py @@ -17,6 +17,7 @@ import os import pyarrow as pa +from pyarrow.csv import write_csv import pyarrow.parquet as pq import pytest @@ -96,6 +97,16 @@ def test_select(df): assert result.column(1) == pa.array([-3, -3, -3]) +def test_select_mixed_expr_string(df): + df = df.select_columns(column("b"), "a") + + # execute and collect the first (and only) batch + result = df.collect()[0] + + assert result.column(0) == pa.array([4, 5, 6]) + assert result.column(1) == pa.array([1, 2, 
3]) + + def test_select_columns(df): df = df.select_columns("b", "a") @@ -107,17 +118,29 @@ def test_select_columns(df): def test_filter(df): - df = df.filter(column("a") > literal(2)).select( + df1 = df.filter(column("a") > literal(2)).select( column("a") + column("b"), column("a") - column("b"), ) # execute and collect the first (and only) batch - result = df.collect()[0] + result = df1.collect()[0] assert result.column(0) == pa.array([9]) assert result.column(1) == pa.array([-3]) + df.show() + # verify that if there is no filter applied, internal dataframe is unchanged + df2 = df.filter() + assert df.df == df2.df + + df3 = df.filter(column("a") > literal(1), column("b") != literal(6)) + result = df3.collect()[0] + + assert result.column(0) == pa.array([2]) + assert result.column(1) == pa.array([5]) + assert result.column(2) == pa.array([5]) + def test_sort(df): df = df.sort(column("b").sort(ascending=False)) @@ -175,7 +198,7 @@ def test_with_column_renamed(df): def test_unnest(nested_df): - nested_df = nested_df.unnest_column("a") + nested_df = nested_df.unnest_columns("a") # execute and collect the first (and only) batch result = nested_df.collect()[0] @@ -185,7 +208,7 @@ def test_unnest(nested_df): def test_unnest_without_nulls(nested_df): - nested_df = nested_df.unnest_column("a", preserve_nulls=False) + nested_df = nested_df.unnest_columns("a", preserve_nulls=False) # execute and collect the first (and only) batch result = nested_df.collect()[0] @@ -379,7 +402,7 @@ def test_get_dataframe(tmp_path): ], names=["int", "str", "float"], ) - pa.csv.write_csv(table, path) + write_csv(table, path) ctx.register_csv("csv", path) @@ -611,7 +634,7 @@ def test_to_pandas(df): # Convert datafusion dataframe to pandas dataframe pandas_df = df.to_pandas() - assert type(pandas_df) == pd.DataFrame + assert isinstance(pandas_df, pd.DataFrame) assert pandas_df.shape == (3, 3) assert set(pandas_df.columns) == {"a", "b", "c"} @@ -622,7 +645,7 @@ def test_empty_to_pandas(df): # 
Convert empty datafusion dataframe to pandas dataframe pandas_df = df.limit(0).to_pandas() - assert type(pandas_df) == pd.DataFrame + assert isinstance(pandas_df, pd.DataFrame) assert pandas_df.shape == (0, 3) assert set(pandas_df.columns) == {"a", "b", "c"} @@ -633,7 +656,7 @@ def test_to_polars(df): # Convert datafusion dataframe to polars dataframe polars_df = df.to_polars() - assert type(polars_df) == pl.DataFrame + assert isinstance(polars_df, pl.DataFrame) assert polars_df.shape == (3, 3) assert set(polars_df.columns) == {"a", "b", "c"} @@ -644,7 +667,7 @@ def test_empty_to_polars(df): # Convert empty datafusion dataframe to polars dataframe polars_df = df.limit(0).to_polars() - assert type(polars_df) == pl.DataFrame + assert isinstance(polars_df, pl.DataFrame) assert polars_df.shape == (0, 3) assert set(polars_df.columns) == {"a", "b", "c"} @@ -652,13 +675,15 @@ def test_empty_to_polars(df): def test_to_arrow_table(df): # Convert datafusion dataframe to pyarrow Table pyarrow_table = df.to_arrow_table() - assert type(pyarrow_table) == pa.Table + assert isinstance(pyarrow_table, pa.Table) assert pyarrow_table.shape == (3, 3) assert set(pyarrow_table.column_names) == {"a", "b", "c"} def test_execute_stream(df): stream = df.execute_stream() + for s in stream: + print(type(s)) assert all(batch is not None for batch in stream) assert not list(stream) # after one iteration the generator must be exhausted @@ -690,7 +715,7 @@ def test_execute_stream_partitioned(df): def test_empty_to_arrow_table(df): # Convert empty datafusion dataframe to pyarrow Table pyarrow_table = df.limit(0).to_arrow_table() - assert type(pyarrow_table) == pa.Table + assert isinstance(pyarrow_table, pa.Table) assert pyarrow_table.shape == (0, 3) assert set(pyarrow_table.column_names) == {"a", "b", "c"} @@ -736,8 +761,35 @@ def test_describe(df): } -def test_write_parquet(df, tmp_path): - path = tmp_path +@pytest.mark.parametrize("path_to_str", (True, False)) +def test_write_csv(ctx, df, 
tmp_path, path_to_str): + path = str(tmp_path) if path_to_str else tmp_path + + df.write_csv(path, with_header=True) + + ctx.register_csv("csv", path) + result = ctx.table("csv").to_pydict() + expected = df.to_pydict() + + assert result == expected + + +@pytest.mark.parametrize("path_to_str", (True, False)) +def test_write_json(ctx, df, tmp_path, path_to_str): + path = str(tmp_path) if path_to_str else tmp_path + + df.write_json(path) + + ctx.register_json("json", path) + result = ctx.table("json").to_pydict() + expected = df.to_pydict() + + assert result == expected + + +@pytest.mark.parametrize("path_to_str", (True, False)) +def test_write_parquet(df, tmp_path, path_to_str): + path = str(tmp_path) if path_to_str else tmp_path df.write_parquet(str(path)) result = pq.read_table(str(path)).to_pydict() @@ -795,3 +847,15 @@ def test_write_compressed_parquet_missing_compression_level(df, tmp_path, compre with pytest.raises(ValueError): df.write_parquet(str(path), compression=compression) + + +# ctx = SessionContext() + +# # create a RecordBatch and a new DataFrame from it +# batch = pa.RecordBatch.from_arrays( +# [pa.array([1, 2, 3]), pa.array([4, 5, 6]), pa.array([8, 5, 8])], +# names=["a", "b", "c"], +# ) + +# df = ctx.create_dataframe([[batch]]) +# test_execute_stream(df) diff --git a/python/datafusion/tests/test_expr.py b/python/datafusion/tests/test_expr.py index 73f7d087a..c9f0e98d5 100644 --- a/python/datafusion/tests/test_expr.py +++ b/python/datafusion/tests/test_expr.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-from datafusion import SessionContext +from datafusion import SessionContext, col from datafusion.expr import Column, Literal, BinaryExpr, AggregateFunction from datafusion.expr import ( Projection, @@ -25,6 +25,7 @@ Sort, TableScan, ) +import pyarrow import pytest @@ -116,3 +117,25 @@ def test_sort(test_ctx): plan = plan.to_variant() assert isinstance(plan, Sort) + + +def test_relational_expr(test_ctx): + ctx = SessionContext() + + batch = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1, 2, 3]), pyarrow.array(["alpha", "beta", "gamma"])], + names=["a", "b"], + ) + df = ctx.create_dataframe([[batch]], name="batch_array") + + assert df.filter(col("a") == 1).count() == 1 + assert df.filter(col("a") != 1).count() == 2 + assert df.filter(col("a") >= 1).count() == 3 + assert df.filter(col("a") > 1).count() == 2 + assert df.filter(col("a") <= 3).count() == 3 + assert df.filter(col("a") < 3).count() == 2 + + assert df.filter(col("b") == "beta").count() == 1 + assert df.filter(col("b") != "beta").count() == 2 + + assert df.filter(col("a") == "beta").count() == 0 diff --git a/python/datafusion/tests/test_functions.py b/python/datafusion/tests/test_functions.py index 449f706c3..2384b6ab8 100644 --- a/python/datafusion/tests/test_functions.py +++ b/python/datafusion/tests/test_functions.py @@ -54,12 +54,11 @@ def test_named_struct(df): df = df.with_column( "d", f.named_struct( - literal("a"), - column("a"), - literal("b"), - column("b"), - literal("c"), - column("c"), + [ + ("a", column("a")), + ("b", column("b")), + ("c", column("c")), + ] ), ) @@ -97,9 +96,7 @@ def test_literal(df): def test_lit_arith(df): - """ - Test literals with arithmetic operations - """ + """Test literals with arithmetic operations""" df = df.select(literal(1) + column("b"), f.concat(column("a"), literal("!"))) result = df.collect() assert len(result) == 1 @@ -140,6 +137,7 @@ def test_math_functions(): f.power(col_v, literal(pa.scalar(3))), f.pow(col_v, literal(pa.scalar(4))), f.round(col_v), + 
f.round(col_v, literal(pa.scalar(3))), f.sqrt(col_v), f.signum(col_v), f.trunc(col_v), @@ -183,29 +181,30 @@ def test_math_functions(): np.testing.assert_array_almost_equal(result.column(15), np.power(values, 3)) np.testing.assert_array_almost_equal(result.column(16), np.power(values, 4)) np.testing.assert_array_almost_equal(result.column(17), np.round(values)) - np.testing.assert_array_almost_equal(result.column(18), np.sqrt(values)) - np.testing.assert_array_almost_equal(result.column(19), np.sign(values)) - np.testing.assert_array_almost_equal(result.column(20), np.trunc(values)) - np.testing.assert_array_almost_equal(result.column(21), np.arcsinh(values)) - np.testing.assert_array_almost_equal(result.column(22), np.arccosh(values)) - np.testing.assert_array_almost_equal(result.column(23), np.arctanh(values)) - np.testing.assert_array_almost_equal(result.column(24), np.cbrt(values)) - np.testing.assert_array_almost_equal(result.column(25), np.cosh(values)) - np.testing.assert_array_almost_equal(result.column(26), np.degrees(values)) - np.testing.assert_array_almost_equal(result.column(27), np.gcd(9, 3)) - np.testing.assert_array_almost_equal(result.column(28), np.lcm(6, 4)) + np.testing.assert_array_almost_equal(result.column(18), np.round(values, 3)) + np.testing.assert_array_almost_equal(result.column(19), np.sqrt(values)) + np.testing.assert_array_almost_equal(result.column(20), np.sign(values)) + np.testing.assert_array_almost_equal(result.column(21), np.trunc(values)) + np.testing.assert_array_almost_equal(result.column(22), np.arcsinh(values)) + np.testing.assert_array_almost_equal(result.column(23), np.arccosh(values)) + np.testing.assert_array_almost_equal(result.column(24), np.arctanh(values)) + np.testing.assert_array_almost_equal(result.column(25), np.cbrt(values)) + np.testing.assert_array_almost_equal(result.column(26), np.cosh(values)) + np.testing.assert_array_almost_equal(result.column(27), np.degrees(values)) + 
np.testing.assert_array_almost_equal(result.column(28), np.gcd(9, 3)) + np.testing.assert_array_almost_equal(result.column(29), np.lcm(6, 4)) np.testing.assert_array_almost_equal( - result.column(29), np.where(np.isnan(na_values), 5, na_values) + result.column(30), np.where(np.isnan(na_values), 5, na_values) ) - np.testing.assert_array_almost_equal(result.column(30), np.pi) - np.testing.assert_array_almost_equal(result.column(31), np.radians(values)) - np.testing.assert_array_almost_equal(result.column(32), np.sinh(values)) - np.testing.assert_array_almost_equal(result.column(33), np.tanh(values)) - np.testing.assert_array_almost_equal(result.column(34), math.factorial(6)) - np.testing.assert_array_almost_equal(result.column(35), np.isnan(na_values)) - np.testing.assert_array_almost_equal(result.column(36), na_values == 0) + np.testing.assert_array_almost_equal(result.column(31), np.pi) + np.testing.assert_array_almost_equal(result.column(32), np.radians(values)) + np.testing.assert_array_almost_equal(result.column(33), np.sinh(values)) + np.testing.assert_array_almost_equal(result.column(34), np.tanh(values)) + np.testing.assert_array_almost_equal(result.column(35), math.factorial(6)) + np.testing.assert_array_almost_equal(result.column(36), np.isnan(na_values)) + np.testing.assert_array_almost_equal(result.column(37), na_values == 0) np.testing.assert_array_almost_equal( - result.column(37), np.emath.logn(3, values + 1.0) + result.column(38), np.emath.logn(3, values + 1.0) ) @@ -591,7 +590,12 @@ def test_string_functions(df): f.trim(column("c")), f.upper(column("c")), f.ends_with(column("a"), literal("llo")), + f.overlay(column("a"), literal("--"), literal(2)), + f.regexp_like(column("a"), literal("(ell|orl)")), + f.regexp_match(column("a"), literal("(ell|orl)")), + f.regexp_replace(column("a"), literal("(ell|orl)"), literal("-")), ) + result = df.collect() assert len(result) == 1 result = result[0] @@ -632,6 +636,10 @@ def test_string_functions(df): assert 
result.column(26) == pa.array(["hello", "world", "!"]) assert result.column(27) == pa.array(["HELLO ", " WORLD ", " !"]) assert result.column(28) == pa.array([True, False, False]) + assert result.column(29) == pa.array(["H--lo", "W--ld", "--"]) + assert result.column(30) == pa.array([True, True, False]) + assert result.column(31) == pa.array([["ell"], ["orl"], None]) + assert result.column(32) == pa.array(["H-o", "W-d", "!"]) def test_hash_functions(df): diff --git a/python/datafusion/tests/test_imports.py b/python/datafusion/tests/test_imports.py index bd4e7c31d..3d324fb62 100644 --- a/python/datafusion/tests/test_imports.py +++ b/python/datafusion/tests/test_imports.py @@ -94,13 +94,24 @@ def test_datafusion_python_version(): def test_class_module_is_datafusion(): + # context for klass in [ SessionContext, + ]: + assert klass.__module__ == "datafusion.context" + + # dataframe + for klass in [ DataFrame, - ScalarUDF, + ]: + assert klass.__module__ == "datafusion.dataframe" + + # udf + for klass in [ AggregateUDF, + ScalarUDF, ]: - assert klass.__module__ == "datafusion" + assert klass.__module__ == "datafusion.udf" # expressions for klass in [Expr, Column, Literal, BinaryExpr, AggregateFunction]: diff --git a/python/datafusion/tests/test_sql.py b/python/datafusion/tests/test_sql.py index 8ec2ffb12..d85f380e7 100644 --- a/python/datafusion/tests/test_sql.py +++ b/python/datafusion/tests/test_sql.py @@ -19,6 +19,7 @@ import numpy as np import pyarrow as pa +from pyarrow.csv import write_csv import pyarrow.dataset as ds import pytest from datafusion.object_store import LocalFileSystem @@ -45,7 +46,7 @@ def test_register_csv(ctx, tmp_path): ], names=["int", "str", "float"], ) - pa.csv.write_csv(table, path) + write_csv(table, path) with open(path, "rb") as csv_file: with gzip.open(gzip_path, "wb") as gzipped_file: @@ -76,7 +77,13 @@ def test_register_csv(ctx, tmp_path): ) ctx.register_csv("csv3", path, schema=alternative_schema) - assert ctx.tables() == {"csv", 
"csv1", "csv2", "csv3", "csv_gzip"} + assert ctx.catalog().database().names() == { + "csv", + "csv1", + "csv2", + "csv3", + "csv_gzip", + } for table in ["csv", "csv1", "csv2", "csv_gzip"]: result = ctx.sql(f"SELECT COUNT(int) AS cnt FROM {table}").collect() @@ -100,14 +107,16 @@ def test_register_csv(ctx, tmp_path): def test_register_parquet(ctx, tmp_path): path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) ctx.register_parquet("t", path) - assert ctx.tables() == {"t"} + ctx.register_parquet("t1", str(path)) + assert ctx.catalog().database().names() == {"t", "t1"} result = ctx.sql("SELECT COUNT(a) AS cnt FROM t").collect() result = pa.Table.from_batches(result) assert result.to_pydict() == {"cnt": [100]} -def test_register_parquet_partitioned(ctx, tmp_path): +@pytest.mark.parametrize("path_to_str", (True, False)) +def test_register_parquet_partitioned(ctx, tmp_path, path_to_str): dir_root = tmp_path / "dataset_parquet_partitioned" dir_root.mkdir(exist_ok=False) (dir_root / "grp=a").mkdir(exist_ok=False) @@ -124,14 +133,16 @@ def test_register_parquet_partitioned(ctx, tmp_path): pa.parquet.write_table(table.slice(0, 3), dir_root / "grp=a/file.parquet") pa.parquet.write_table(table.slice(3, 4), dir_root / "grp=b/file.parquet") + dir_root = str(dir_root) if path_to_str else dir_root + ctx.register_parquet( "datapp", - str(dir_root), + dir_root, table_partition_cols=[("grp", "string")], parquet_pruning=True, file_extension=".parquet", ) - assert ctx.tables() == {"datapp"} + assert ctx.catalog().database().names() == {"datapp"} result = ctx.sql("SELECT grp, COUNT(*) AS cnt FROM datapp GROUP BY grp").collect() result = pa.Table.from_batches(result) @@ -140,12 +151,14 @@ def test_register_parquet_partitioned(ctx, tmp_path): assert dict(zip(rd["grp"], rd["cnt"])) == {"a": 3, "b": 1} -def test_register_dataset(ctx, tmp_path): +@pytest.mark.parametrize("path_to_str", (True, False)) +def test_register_dataset(ctx, tmp_path, path_to_str): path = 
helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) + path = str(path) if path_to_str else path dataset = ds.dataset(path, format="parquet") ctx.register_dataset("t", dataset) - assert ctx.tables() == {"t"} + assert ctx.catalog().database().names() == {"t"} result = ctx.sql("SELECT COUNT(a) AS cnt FROM t").collect() result = pa.Table.from_batches(result) @@ -174,6 +187,12 @@ def test_register_json(ctx, tmp_path): file_extension="gz", file_compression_type="gzip", ) + ctx.register_json( + "json_gzip1", + str(gzip_path), + file_extension="gz", + file_compression_type="gzip", + ) alternative_schema = pa.schema( [ @@ -184,7 +203,14 @@ def test_register_json(ctx, tmp_path): ) ctx.register_json("json3", path, schema=alternative_schema) - assert ctx.tables() == {"json", "json1", "json2", "json3", "json_gzip"} + assert ctx.catalog().database().names() == { + "json", + "json1", + "json2", + "json3", + "json_gzip", + "json_gzip1", + } for table in ["json", "json1", "json2", "json_gzip"]: result = ctx.sql(f'SELECT COUNT("B") AS cnt FROM {table}').collect() @@ -234,7 +260,7 @@ def test_execute(ctx, tmp_path): path = helpers.write_parquet(tmp_path / "a.parquet", pa.array(data)) ctx.register_parquet("t", path) - assert ctx.tables() == {"t"} + assert ctx.catalog().database().names() == {"t"} # count result = ctx.sql("SELECT COUNT(a) AS cnt FROM t WHERE a IS NOT NULL").collect() @@ -280,9 +306,7 @@ def test_execute(ctx, tmp_path): def test_cast(ctx, tmp_path): - """ - Verify that we can cast - """ + """Verify that we can cast""" path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) ctx.register_parquet("t", path) @@ -379,7 +403,10 @@ def test_simple_select(ctx, tmp_path, arr): @pytest.mark.parametrize("file_sort_order", (None, [[col("int").sort(True, True)]])) @pytest.mark.parametrize("pass_schema", (True, False)) -def test_register_listing_table(ctx, tmp_path, pass_schema, file_sort_order): +@pytest.mark.parametrize("path_to_str", (True, False)) +def 
test_register_listing_table( + ctx, tmp_path, pass_schema, file_sort_order, path_to_str +): dir_root = tmp_path / "dataset_parquet_partitioned" dir_root.mkdir(exist_ok=False) (dir_root / "grp=a/date_id=20201005").mkdir(exist_ok=False, parents=True) @@ -404,16 +431,18 @@ def test_register_listing_table(ctx, tmp_path, pass_schema, file_sort_order): table.slice(5, 10), dir_root / "grp=b/date_id=20201005/file.parquet" ) + dir_root = f"file://{dir_root}/" if path_to_str else dir_root + ctx.register_object_store("file://local", LocalFileSystem(), None) ctx.register_listing_table( "my_table", - f"file://{dir_root}/", + dir_root, table_partition_cols=[("grp", "string"), ("date_id", "int")], file_extension=".parquet", schema=table.schema if pass_schema else None, file_sort_order=file_sort_order, ) - assert ctx.tables() == {"my_table"} + assert ctx.catalog().database().names() == {"my_table"} result = ctx.sql( "SELECT grp, COUNT(*) AS count FROM my_table GROUP BY grp" diff --git a/python/datafusion/tests/test_substrait.py b/python/datafusion/tests/test_substrait.py index 62f6413a3..2071c8f3b 100644 --- a/python/datafusion/tests/test_substrait.py +++ b/python/datafusion/tests/test_substrait.py @@ -35,17 +35,43 @@ def test_substrait_serialization(ctx): ctx.register_record_batches("t", [[batch]]) - assert ctx.tables() == {"t"} + assert ctx.catalog().database().names() == {"t"} # For now just make sure the method calls blow up - substrait_plan = ss.substrait.serde.serialize_to_plan("SELECT * FROM t", ctx) + substrait_plan = ss.Serde.serialize_to_plan("SELECT * FROM t", ctx) substrait_bytes = substrait_plan.encode() assert isinstance(substrait_bytes, bytes) - substrait_bytes = ss.substrait.serde.serialize_bytes("SELECT * FROM t", ctx) - substrait_plan = ss.substrait.serde.deserialize_bytes(substrait_bytes) - logical_plan = ss.substrait.consumer.from_substrait_plan(ctx, substrait_plan) + substrait_bytes = ss.Serde.serialize_bytes("SELECT * FROM t", ctx) + substrait_plan = 
ss.Serde.deserialize_bytes(substrait_bytes) + logical_plan = ss.Consumer.from_substrait_plan(ctx, substrait_plan) # demonstrate how to create a DataFrame from a deserialized logical plan df = ctx.create_dataframe_from_logical_plan(logical_plan) - substrait_plan = ss.substrait.producer.to_substrait_plan(df.logical_plan(), ctx) + substrait_plan = ss.Producer.to_substrait_plan(df.logical_plan(), ctx) + + +@pytest.mark.parametrize("path_to_str", (True, False)) +def test_substrait_file_serialization(ctx, tmp_path, path_to_str): + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + + ctx.register_record_batches("t", [[batch]]) + + assert ctx.catalog().database().names() == {"t"} + + path = tmp_path / "substrait_plan" + path = str(path) if path_to_str else path + + sql_command = "SELECT * FROM T" + ss.Serde.serialize(sql_command, ctx, path) + + expected_plan = ss.Serde.serialize_to_plan(sql_command, ctx) + actual_plan = ss.Serde.deserialize(path) + + expected_logical_plan = ss.Consumer.from_substrait_plan(ctx, expected_plan) + expected_actual_plan = ss.Consumer.from_substrait_plan(ctx, actual_plan) + + assert str(expected_logical_plan) == str(expected_actual_plan) diff --git a/python/datafusion/tests/test_udaf.py b/python/datafusion/tests/test_udaf.py index c2b29d199..81194927c 100644 --- a/python/datafusion/tests/test_udaf.py +++ b/python/datafusion/tests/test_udaf.py @@ -25,9 +25,7 @@ class Summarize(Accumulator): - """ - Interface of a user-defined accumulation. - """ + """Interface of a user-defined accumulation.""" def __init__(self): self._sum = pa.scalar(0.0) diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py new file mode 100644 index 000000000..4bfbabe69 --- /dev/null +++ b/python/datafusion/udf.py @@ -0,0 +1,248 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Provides the user defined functions for evaluation of dataframes.""" + +from __future__ import annotations + +import datafusion._internal as df_internal +from datafusion.expr import Expr +from typing import Callable, TYPE_CHECKING, TypeVar +from abc import ABCMeta, abstractmethod +from typing import List +from enum import Enum +import pyarrow + +if TYPE_CHECKING: + _R = TypeVar("_R", bound=pyarrow.DataType) + + +class Volatility(Enum): + """Defines how stable or volatile a function is. + + When setting the volatility of a function, you can either pass this + enumeration or a `str`. The `str` equivalent is the lower case value of the + name (`"immutable"`, `"stable"`, or `"volatile"`). + """ + + Immutable = 1 + """An immutable function will always return the same output when given the + same input. + + DataFusion will attempt to inline immutable functions during planning. + """ + + Stable = 2 + """ + Returns the same value for a given input within a single query. + + A stable function may return different values given the same input across + different queries but must return the same value for a given input within a + query. An example of this is the `Now` function. DataFusion will attempt to + inline `Stable` functions during planning, when possible. 
For query + `select col1, now() from t1`, it might take a while to execute but `now()` + column will be the same for each output row, which is evaluated during + planning. + """ + + Volatile = 3 + """A volatile function may change the return value from evaluation to + evaluation. + + Multiple invocations of a volatile function may return different results + when used in the same query. An example of this is the random() function. + DataFusion can not evaluate such functions during planning. In the query + `select col1, random() from t1`, `random()` function will be evaluated + for each output row, resulting in a unique random value for each row. + """ + + def __str__(self): + """Returns the string equivalent.""" + return self.name.lower() + + +class ScalarUDF: + """Class for performing scalar user defined functions (UDF). + + Scalar UDFs operate on a row by row basis. See also ``AggregateUDF`` for + operating on a group of rows. + """ + + def __init__( + self, + name: str | None, + func: Callable[..., _R], + input_types: list[pyarrow.DataType], + return_type: _R, + volatility: Volatility | str, + ) -> None: + """Instantiate a scalar user defined function (UDF). + + See helper method ``udf`` for argument details. + """ + self.udf = df_internal.ScalarUDF( + name, func, input_types, return_type, str(volatility) + ) + + def __call__(self, *args: Expr) -> Expr: + """Execute the UDF. + + This function is not typically called by an end user. These calls will + occur during the evaluation of the dataframe. + """ + args = [arg.expr for arg in args] + return Expr(self.udf.__call__(*args)) + + @staticmethod + def udf( + func: Callable[..., _R], + input_types: list[pyarrow.DataType], + return_type: _R, + volatility: Volatility | str, + name: str | None = None, + ) -> ScalarUDF: + """Create a new User Defined Function. + + Args: + func: A callable python function. + input_types: The data types of the arguments to `func`. 
This list + must be of the same length as the number of arguments. + return_type: The data type of the return value from the python + function. + volatility: See ``Volatility`` for allowed values. + name: A descriptive name for the function. + + Returns: + A user defined scalar function, which can be called on expressions + and is evaluated on a row by row basis. + """ + if not callable(func): + raise TypeError("`func` argument must be callable") + if name is None: + name = func.__qualname__.lower() + return ScalarUDF( + name=name, + func=func, + input_types=input_types, + return_type=return_type, + volatility=volatility, + ) + + +class Accumulator(metaclass=ABCMeta): + """Defines how an `AggregateUDF` accumulates values during an evaluation.""" + + @abstractmethod + def state(self) -> List[pyarrow.Scalar]: + """Return the current state.""" + pass + + @abstractmethod + def update(self, values: pyarrow.Array) -> None: + """Evaluate an array of values and update state.""" + pass + + @abstractmethod + def merge(self, states: pyarrow.Array) -> None: + """Merge a set of states.""" + pass + + @abstractmethod + def evaluate(self) -> pyarrow.Scalar: + """Return the resultant value.""" + pass + + +if TYPE_CHECKING: + _A = TypeVar("_A", bound=(Callable[..., _R], Accumulator)) + + +class AggregateUDF: + """Class for performing aggregate user defined functions (UDAF). + + Aggregate UDFs operate on a group of rows and return a single value. See + also ``ScalarUDF`` for operating on a row by row basis. + """ + + def __init__( + self, + name: str | None, + accumulator: _A, + input_types: list[pyarrow.DataType], + return_type: _R, + state_type: list[pyarrow.DataType], + volatility: Volatility | str, + ) -> None: + """Instantiate a user defined aggregate function (UDAF). + + See ``AggregateUDF.udaf`` for a convenience function and argument + descriptions.
+ """ + self.udf = df_internal.AggregateUDF( + name, accumulator, input_types, return_type, state_type, str(volatility) + ) + + def __call__(self, *args: Expr) -> Expr: + """Execute the UDAF. + + This function is not typically called by an end user. These calls will + occur during the evaluation of the dataframe. + """ + args = [arg.expr for arg in args] + return Expr(self.udf.__call__(*args)) + + @staticmethod + def udaf( + accum: _A, + input_types: list[pyarrow.DataType], + return_type: _R, + state_type: list[pyarrow.DataType], + volatility: Volatility | str, + name: str | None = None, + ) -> AggregateUDF: + """Create a new User Defined Aggregate Function. + + The accumulator function must be callable and implement `Accumulator`. + + Args: + accum: The accumulator python function. + input_types: The data types of the arguments to `accum`. + return_type: The data type of the return value. + state_type: The data types of the intermediate accumulation. + volatility: See `Volatility` for allowed values. + name: A descriptive name for the function. + + Returns: + A user defined aggregate function, which can be used in either data + aggregation or window function calls. 
+ """ + if not issubclass(accum, Accumulator): + raise TypeError( + "`accum` must implement the abstract base class Accumulator" + ) + if name is None: + name = accum.__qualname__.lower() + if isinstance(input_types, pyarrow.lib.DataType): + input_types = [input_types] + return AggregateUDF( + name=name, + accumulator=accum, + input_types=input_types, + return_type=return_type, + state_type=state_type, + volatility=volatility, + ) diff --git a/src/common.rs b/src/common.rs index 094e70c01..453bf67a4 100644 --- a/src/common.rs +++ b/src/common.rs @@ -27,6 +27,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/src/common/data_type.rs b/src/common/data_type.rs index 313318fc9..3299a46f7 100644 --- a/src/common/data_type.rs +++ b/src/common/data_type.rs @@ -764,7 +764,7 @@ pub enum SqlType { #[allow(non_camel_case_types)] #[allow(clippy::upper_case_acronyms)] #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "PythonType", module = "datafusion.common")] +#[pyclass(name = "NullTreatment", module = "datafusion.common")] pub enum NullTreatment { IGNORE_NULLS, RESPECT_NULLS, diff --git a/src/dataframe.rs b/src/dataframe.rs index 9e36be2c4..4db59d4fe 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use datafusion::arrow::datatypes::Schema; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; use datafusion::arrow::util::pretty; -use datafusion::config::TableParquetOptions; +use datafusion::config::{CsvOptions, TableParquetOptions}; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::execution::SendableRecordBatchStream; use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; @@ -320,6 +320,18 @@ impl PyDataFrame { Ok(Self::new(df)) } + #[pyo3(signature = (columns, preserve_nulls=true))] 
+ fn unnest_columns(&self, columns: Vec, preserve_nulls: bool) -> PyResult { + let unnest_options = UnnestOptions { preserve_nulls }; + let cols = columns.iter().map(|s| s.as_ref()).collect::>(); + let df = self + .df + .as_ref() + .clone() + .unnest_columns_with_options(&cols, unnest_options)?; + Ok(Self::new(df)) + } + /// Calculate the intersection of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema fn intersect(&self, py_df: PyDataFrame) -> PyResult { let new_df = self @@ -337,13 +349,18 @@ impl PyDataFrame { } /// Write a `DataFrame` to a CSV file. - fn write_csv(&self, path: &str, py: Python) -> PyResult<()> { + fn write_csv(&self, path: &str, with_header: bool, py: Python) -> PyResult<()> { + let csv_options = CsvOptions { + has_header: Some(with_header), + ..Default::default() + }; wait_for_future( py, - self.df - .as_ref() - .clone() - .write_csv(path, DataFrameWriteOptions::new(), None), + self.df.as_ref().clone().write_csv( + path, + DataFrameWriteOptions::new(), + Some(csv_options), + ), )?; Ok(()) } diff --git a/src/expr.rs b/src/expr.rs index dc1de669b..aab0daa6f 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -583,6 +583,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/src/functions.rs b/src/functions.rs index 0ec0d5ef1..1e75dd5fb 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -233,6 +233,12 @@ fn concat_ws(sep: String, args: Vec) -> PyResult { Ok(functions::string::expr_fn::concat_ws(lit(sep), args).into()) } +#[pyfunction] +#[pyo3(signature = (values, regex, flags = None))] +fn regexp_like(values: PyExpr, regex: PyExpr, flags: Option) -> PyResult { + Ok(functions::expr_fn::regexp_like(values.expr, regex.expr, flags.map(|x| x.expr)).into()) +} + #[pyfunction] #[pyo3(signature = (values, regex, flags = None))] fn regexp_match(values: PyExpr, regex: PyExpr, 
flags: Option) -> PyResult { @@ -257,12 +263,12 @@ fn regexp_replace( } /// Creates a new Sort Expr #[pyfunction] -fn order_by(expr: PyExpr, asc: Option, nulls_first: Option) -> PyResult { +fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) -> PyResult { Ok(PyExpr { expr: datafusion_expr::Expr::Sort(Sort { expr: Box::new(expr.expr), - asc: asc.unwrap_or(true), - nulls_first: nulls_first.unwrap_or(true), + asc, + nulls_first, }), }) } @@ -516,6 +522,7 @@ expr_fn!(chr, arg, "Returns the character with the given code."); expr_fn_vec!(coalesce); expr_fn!(cos, num); expr_fn!(cosh, num); +expr_fn!(cot, num); expr_fn!(degrees, num); expr_fn!(decode, input encoding); expr_fn!(encode, input encoding); @@ -527,6 +534,7 @@ expr_fn!(gcd, x y); expr_fn!(initcap, string, "Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters."); expr_fn!(isnan, num); expr_fn!(iszero, num); +expr_fn!(levenshtein, string1 string2); expr_fn!(lcm, x y); expr_fn!(left, string n, "Returns first n characters in the string, or when n is negative, returns all but last |n| characters."); expr_fn!(ln, num); @@ -548,6 +556,7 @@ expr_fn!( ); expr_fn!(nullif, arg_1 arg_2); expr_fn!(octet_length, args, "Returns number of bytes in the string. Since this version of the function accepts type character directly, it will not strip trailing spaces."); +expr_fn_vec!(overlay); expr_fn!(pi); expr_fn!(power, base exponent); expr_fn!(pow, power, base exponent); @@ -583,7 +592,9 @@ expr_fn!(sqrt, num); expr_fn!(starts_with, string prefix, "Returns true if string starts with prefix."); expr_fn!(strpos, string substring, "Returns starting index of specified substring within string, or zero if it's not present. 
(Same as position(substring in string), but note the reversed argument order.)"); expr_fn!(substr, string position); +expr_fn!(substr_index, string delimiter count); expr_fn!(substring, string position length); +expr_fn!(find_in_set, string string_list); expr_fn!(tan, num); expr_fn!(tanh, num); expr_fn!( @@ -596,6 +607,7 @@ expr_fn_vec!(to_timestamp); expr_fn_vec!(to_timestamp_millis); expr_fn_vec!(to_timestamp_micros); expr_fn_vec!(to_timestamp_seconds); +expr_fn_vec!(to_unixtime); expr_fn!(current_date); expr_fn!(current_time); expr_fn!(date_part, part date); @@ -603,6 +615,7 @@ expr_fn!(datepart, date_part, part date); expr_fn!(date_trunc, part date); expr_fn!(datetrunc, date_trunc, part date); expr_fn!(date_bin, stride source origin); +expr_fn!(make_date, year month day); expr_fn!(translate, string from to, "Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted."); expr_fn_vec!(trim, "Removes the longest string containing only characters in characters (a space by default) from the start, end, or both ends (BOTH is the default) of string."); @@ -740,6 +753,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(corr))?; m.add_wrapped(wrap_pyfunction!(cos))?; m.add_wrapped(wrap_pyfunction!(cosh))?; + m.add_wrapped(wrap_pyfunction!(cot))?; m.add_wrapped(wrap_pyfunction!(count))?; m.add_wrapped(wrap_pyfunction!(count_star))?; m.add_wrapped(wrap_pyfunction!(covar))?; @@ -753,6 +767,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(date_part))?; m.add_wrapped(wrap_pyfunction!(datetrunc))?; m.add_wrapped(wrap_pyfunction!(date_trunc))?; + m.add_wrapped(wrap_pyfunction!(make_date))?; m.add_wrapped(wrap_pyfunction!(digest))?; m.add_wrapped(wrap_pyfunction!(ends_with))?; m.add_wrapped(wrap_pyfunction!(exp))?; @@ -765,6 
+780,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(initcap))?; m.add_wrapped(wrap_pyfunction!(isnan))?; m.add_wrapped(wrap_pyfunction!(iszero))?; + m.add_wrapped(wrap_pyfunction!(levenshtein))?; m.add_wrapped(wrap_pyfunction!(lcm))?; m.add_wrapped(wrap_pyfunction!(left))?; m.add_wrapped(wrap_pyfunction!(length))?; @@ -787,11 +803,13 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(nullif))?; m.add_wrapped(wrap_pyfunction!(octet_length))?; m.add_wrapped(wrap_pyfunction!(order_by))?; + m.add_wrapped(wrap_pyfunction!(overlay))?; m.add_wrapped(wrap_pyfunction!(pi))?; m.add_wrapped(wrap_pyfunction!(power))?; m.add_wrapped(wrap_pyfunction!(pow))?; m.add_wrapped(wrap_pyfunction!(radians))?; m.add_wrapped(wrap_pyfunction!(random))?; + m.add_wrapped(wrap_pyfunction!(regexp_like))?; m.add_wrapped(wrap_pyfunction!(regexp_match))?; m.add_wrapped(wrap_pyfunction!(regexp_replace))?; m.add_wrapped(wrap_pyfunction!(repeat))?; @@ -817,7 +835,9 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(strpos))?; m.add_wrapped(wrap_pyfunction!(r#struct))?; // Use raw identifier since struct is a keyword m.add_wrapped(wrap_pyfunction!(substr))?; + m.add_wrapped(wrap_pyfunction!(substr_index))?; m.add_wrapped(wrap_pyfunction!(substring))?; + m.add_wrapped(wrap_pyfunction!(find_in_set))?; m.add_wrapped(wrap_pyfunction!(sum))?; m.add_wrapped(wrap_pyfunction!(tan))?; m.add_wrapped(wrap_pyfunction!(tanh))?; @@ -826,6 +846,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(to_timestamp_millis))?; m.add_wrapped(wrap_pyfunction!(to_timestamp_micros))?; m.add_wrapped(wrap_pyfunction!(to_timestamp_seconds))?; + m.add_wrapped(wrap_pyfunction!(to_unixtime))?; m.add_wrapped(wrap_pyfunction!(translate))?; m.add_wrapped(wrap_pyfunction!(trim))?; m.add_wrapped(wrap_pyfunction!(trunc))?; diff 
--git a/src/lib.rs b/src/lib.rs index 71c27e1ac..357eaacd9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -92,6 +92,8 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; // Register `common` as a submodule. Matching `datafusion-common` https://docs.rs/datafusion-common/latest/datafusion_common/ let common = PyModule::new_bound(py, "common")?; diff --git a/src/substrait.rs b/src/substrait.rs index 1e9e16c7b..60a523800 100644 --- a/src/substrait.rs +++ b/src/substrait.rs @@ -27,7 +27,7 @@ use datafusion_substrait::serializer; use datafusion_substrait::substrait::proto::Plan; use prost::Message; -#[pyclass(name = "plan", module = "datafusion.substrait", subclass)] +#[pyclass(name = "Plan", module = "datafusion.substrait", subclass)] #[derive(Debug, Clone)] pub struct PyPlan { pub plan: Plan, From f00b8ee1959f354950660b0b0c784d4f48f420c6 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 20 Jul 2024 11:21:46 -0400 Subject: [PATCH 002/248] Expose array sort (#764) --- python/datafusion/functions.py | 22 ++++++++++++++++++++++ python/datafusion/tests/test_functions.py | 8 ++++++++ src/functions.rs | 4 ++++ 3 files changed, 34 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index ad77712ed..46d2a2f04 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1126,6 +1126,28 @@ def list_replace_all(array: Expr, from_val: Expr, to_val: Expr) -> Expr: return array_replace_all(array, from_val, to_val) +def array_sort(array: Expr, descending: bool = False, null_first: bool = False) -> Expr: + """Sort an array. + + Args: + array: The input array to sort. + descending: If True, sorts in descending order. + null_first: If True, nulls will be returned at the beginning of the array. 
+ """ + desc = "DESC" if descending else "ASC" + nulls_first = "NULLS FIRST" if null_first else "NULLS LAST" + return Expr( + f.array_sort( + array.expr, Expr.literal(desc).expr, Expr.literal(nulls_first).expr + ) + ) + + +def list_sort(array: Expr, descending: bool = False, null_first: bool = False) -> Expr: + """This is an alias for ``array_sort``.""" + return array_sort(array, descending=descending, null_first=null_first) + + def array_slice( array: Expr, begin: Expr, end: Expr, stride: Expr | None = None ) -> Expr: diff --git a/python/datafusion/tests/test_functions.py b/python/datafusion/tests/test_functions.py index 2384b6ab8..25d7de14a 100644 --- a/python/datafusion/tests/test_functions.py +++ b/python/datafusion/tests/test_functions.py @@ -453,6 +453,14 @@ def py_flatten(arr): lambda col: f.list_replace_all(col, literal(3.0), literal(4.0)), lambda data: [py_arr_replace(arr, 3.0, 4.0) for arr in data], ], + [ + lambda col: f.array_sort(col, descending=True, null_first=True), + lambda data: [np.sort(arr)[::-1] for arr in data], + ], + [ + lambda col: f.list_sort(col, descending=False, null_first=False), + lambda data: [np.sort(arr) for arr in data], + ], [ lambda col: f.array_slice(col, literal(2), literal(4)), lambda data: [arr[1:4] for arr in data], diff --git a/src/functions.rs b/src/functions.rs index 1e75dd5fb..74eb48a62 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -673,6 +673,8 @@ array_fn!(array_replace_n, array from to max); array_fn!(list_replace_n, array_replace_n, array from to max); array_fn!(array_replace_all, array from to); array_fn!(list_replace_all, array_replace_all, array from to); +array_fn!(array_sort, array desc null_first); +array_fn!(list_sort, array_sort, array desc null_first); array_fn!(array_intersect, first_array second_array); array_fn!(list_intersect, array_intersect, first_array second_array); array_fn!(array_union, array1 array2); @@ -936,6 +938,8 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> 
{ m.add_wrapped(wrap_pyfunction!(list_replace_n))?; m.add_wrapped(wrap_pyfunction!(array_replace_all))?; m.add_wrapped(wrap_pyfunction!(list_replace_all))?; + m.add_wrapped(wrap_pyfunction!(array_sort))?; + m.add_wrapped(wrap_pyfunction!(list_sort))?; m.add_wrapped(wrap_pyfunction!(array_slice))?; m.add_wrapped(wrap_pyfunction!(list_slice))?; m.add_wrapped(wrap_pyfunction!(flatten))?; From fd6b4df6d19a2176b3cd84ecf27004b5ba301d36 Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Thu, 25 Jul 2024 09:14:09 -0500 Subject: [PATCH 003/248] Upgrade protobuf and remove GH Action googletest-installer (#773) * update deps to force new protobuf-src The new version **should** remove the need for the blocked GH action. Ref: https://github.com/MaterializeInc/rust-protobuf-native/issues/20 Ref: https://github.com/apache/datafusion-python/issues/763 * remove googletest-installer github action Fixes https://github.com/apache/datafusion-python/issues/763 * remove needless Clone of TimeUnit type which is Copy --- .github/workflows/build.yml | 4 - .github/workflows/test.yaml | 4 - Cargo.lock | 516 ++++++++++++++++++++---------------- src/common/data_type.rs | 8 +- 4 files changed, 293 insertions(+), 239 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a37abe53a..350be46d5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -89,10 +89,6 @@ jobs: name: python-wheel-license path: . 
- # To remove once https://github.com/MaterializeInc/rust-protobuf-native/issues/20 is resolved - - name: Install gtest - uses: MarkusJx/googletest-installer@v1.1 - - name: Install Protoc uses: arduino/setup-protoc@v1 with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index c9a365bbb..4f47dc984 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -55,10 +55,6 @@ jobs: version: '3.20.2' repo-token: ${{ secrets.GITHUB_TOKEN }} - # To remove once https://github.com/MaterializeInc/rust-protobuf-native/issues/20 is resolved - - name: Install gtest - uses: MarkusJx/googletest-installer@v1.1 - - name: Setup Python uses: actions/setup-python@v5 with: diff --git a/Cargo.lock b/Cargo.lock index 3c282e071..0979306ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -118,9 +118,9 @@ dependencies = [ [[package]] name = "arrayref" -version = "0.3.7" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" +checksum = "9d151e35f61089500b617991b791fc8bfd237ae50cd5950803758a179b41e67a" [[package]] name = "arrayvec" @@ -130,9 +130,9 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arrow" -version = "52.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ae9728f104939be6d8d9b368a354b4929b0569160ea1641f0721b55a861ce38" +checksum = "6127ea5e585a12ec9f742232442828ebaf264dfa5eefdd71282376c599562b77" dependencies = [ "arrow-arith", "arrow-array", @@ -152,9 +152,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "52.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7029a5b3efbeafbf4a12d12dc16b8f9e9bff20a410b8c25c5d28acc089e1043" +checksum = "7add7f39210b7d726e2a8efc0083e7bf06e8f2d15bdb4896b564dce4410fbf5d" dependencies = [ "arrow-array", "arrow-buffer", @@ -167,9 +167,9 
@@ dependencies = [ [[package]] name = "arrow-array" -version = "52.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d33238427c60271710695f17742f45b1a5dc5bcfc5c15331c25ddfe7abf70d97" +checksum = "81c16ec702d3898c2f5cfdc148443c6cd7dbe5bac28399859eb0a3d38f072827" dependencies = [ "ahash", "arrow-buffer", @@ -184,9 +184,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "52.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe9b95e825ae838efaf77e366c00d3fc8cca78134c9db497d6bda425f2e7b7c1" +checksum = "cae6970bab043c4fbc10aee1660ceb5b306d0c42c8cc5f6ae564efcd9759b663" dependencies = [ "bytes", "half", @@ -195,9 +195,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "52.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cf8385a9d5b5fcde771661dd07652b79b9139fea66193eda6a88664400ccab" +checksum = "1c7ef44f26ef4f8edc392a048324ed5d757ad09135eff6d5509e6450d39e0398" dependencies = [ "arrow-array", "arrow-buffer", @@ -216,9 +216,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "52.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cea5068bef430a86690059665e40034625ec323ffa4dd21972048eebb0127adc" +checksum = "5f843490bd258c5182b66e888161bb6f198f49f3792f7c7f98198b924ae0f564" dependencies = [ "arrow-array", "arrow-buffer", @@ -235,9 +235,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "52.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb29be98f987bcf217b070512bb7afba2f65180858bca462edf4a39d84a23e10" +checksum = "a769666ffac256dd301006faca1ca553d0ae7cffcf4cd07095f73f95eb226514" dependencies = [ "arrow-buffer", "arrow-schema", @@ -247,9 +247,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "52.0.0" +version = "52.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffc68f6523970aa6f7ce1dc9a33a7d9284cfb9af77d4ad3e617dbe5d79cc6ec8" +checksum = "dbf9c3fb57390a1af0b7bb3b5558c1ee1f63905f3eccf49ae7676a8d1e6e5a72" dependencies = [ "arrow-array", "arrow-buffer", @@ -262,9 +262,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "52.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2041380f94bd6437ab648e6c2085a045e45a0c44f91a1b9a4fe3fed3d379bfb1" +checksum = "654e7f3724176b66ddfacba31af397c48e106fbe4d281c8144e7d237df5acfd7" dependencies = [ "arrow-array", "arrow-buffer", @@ -282,9 +282,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "52.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb56ed1547004e12203652f12fe12e824161ff9d1e5cf2a7dc4ff02ba94f413" +checksum = "e8008370e624e8e3c68174faaf793540287106cfda8ad1da862fdc53d8e096b4" dependencies = [ "arrow-array", "arrow-buffer", @@ -297,9 +297,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "52.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "575b42f1fc588f2da6977b94a5ca565459f5ab07b60545e17243fb9a7ed6d43e" +checksum = "ca5e3a6b7fda8d9fe03f3b18a2d946354ea7f3c8e4076dbdb502ad50d9d44824" dependencies = [ "ahash", "arrow-array", @@ -312,18 +312,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "52.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32aae6a60458a2389c0da89c9de0b7932427776127da1a738e2efc21d32f3393" +checksum = "dab1c12b40e29d9f3b699e0203c2a73ba558444c05e388a4377208f8f9c97eee" dependencies = [ - "bitflags 2.5.0", + "bitflags 2.6.0", ] [[package]] name = "arrow-select" -version = "52.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"de36abaef8767b4220d7b4a8c2fe5ffc78b47db81b03d77e2136091c3ba39102" +checksum = "e80159088ffe8c48965cb9b1a7c968b2729f29f37363df7eca177fc3281fe7c3" dependencies = [ "ahash", "arrow-array", @@ -335,9 +335,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "52.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e435ada8409bcafc910bc3e0077f532a4daa20e99060a496685c0e3e53cc2597" +checksum = "0fd04a6ea7de183648edbcb7a6dd925bbd04c210895f6384c780e27a9b54afcd" dependencies = [ "arrow-array", "arrow-buffer", @@ -352,9 +352,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd066d0b4ef8ecb03a55319dc13aa6910616d0f44008a045bb1835af830abff5" +checksum = "fec134f64e2bc57411226dfc4e52dec859ddfc7e711fc5e07b612584f000e4aa" dependencies = [ "bzip2", "flate2", @@ -364,8 +364,8 @@ dependencies = [ "pin-project-lite", "tokio", "xz2", - "zstd 0.13.0", - "zstd-safe 7.0.0", + "zstd 0.13.2", + "zstd-safe 7.2.0", ] [[package]] @@ -376,18 +376,18 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] name = "async-trait" -version = "0.1.80" +version = "0.1.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" +checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] @@ -413,9 +413,9 @@ checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" [[package]] name = "backtrace" -version = "0.3.72" +version = "0.3.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17c6a35df3749d2e8bb1b7b21a976d82b15548788d2735b9d82f329268f71a11" +checksum = 
"5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" dependencies = [ "addr2line", "cc", @@ -446,9 +446,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "blake2" @@ -461,9 +461,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.1" +version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30cca6d3674597c30ddf2c587bf8d9d65c9a84d2326d941cc79c9842dfe0ef52" +checksum = "e9ec96fe9a81b5e365f9db71fe00edc4fe4ca2cc7dcb7861f0603012a7caa210" dependencies = [ "arrayref", "arrayvec", @@ -516,9 +516,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" +checksum = "a12916984aab3fa6e39d655a33e09c0071eb36d6ab3aea5c2d78551f1df6d952" [[package]] name = "bzip2" @@ -543,13 +543,12 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.98" +version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f" +checksum = "2aba8f4e9906c7ce3c73463f62a7f0c65183ada1a2d47e397cc8810827f9694f" dependencies = [ "jobserver", "libc", - "once_cell", ] [[package]] @@ -568,7 +567,7 @@ dependencies = [ "iana-time-zone", "num-traits", "serde", - "windows-targets 0.52.5", + "windows-targets 0.52.6", ] [[package]] @@ -608,7 +607,7 @@ version = "7.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" dependencies = [ - "strum 0.26.2", + "strum 0.26.3", "strum_macros 0.26.4", "unicode-width", ] @@ -791,7 +790,7 @@ dependencies = [ "url", "uuid", "xz2", - "zstd 0.13.0", + "zstd 0.13.2", ] [[package]] @@ -863,7 +862,7 @@ dependencies = [ "paste", "serde_json", "sqlparser", - "strum 0.26.2", + "strum 0.26.3", "strum_macros 0.26.4", ] @@ -1052,7 +1051,7 @@ dependencies = [ "rand", "regex-syntax", "sqlparser", - "syn 2.0.68", + "syn 2.0.72", "tokio", "url", "uuid", @@ -1072,7 +1071,7 @@ dependencies = [ "log", "regex", "sqlparser", - "strum 0.26.2", + "strum 0.26.3", ] [[package]] @@ -1117,9 +1116,9 @@ checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" [[package]] name = "either" -version = "1.12.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" [[package]] name = "equivalent" @@ -1240,7 +1239,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] @@ -1383,9 +1382,9 @@ dependencies = [ [[package]] name = "http-body" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", "http", @@ -1393,12 +1392,12 @@ dependencies = [ [[package]] name = "http-body-util" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0475f8b2ac86659c21b64320d5d653f9efe42acd2a4e560073ec61a155a34f1d" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", - 
"futures-core", + "futures-util", "http", "http-body", "pin-project-lite", @@ -1406,9 +1405,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.8.0" +version = "1.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" +checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" [[package]] name = "humantime" @@ -1418,9 +1417,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "1.3.1" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe575dd17d0862a9a33781c8c4696a55c320909004a67a00fb286ba8b1bc496d" +checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" dependencies = [ "bytes", "futures-channel", @@ -1438,15 +1437,16 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.26.0" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c" +checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" dependencies = [ "futures-util", "http", "hyper", "hyper-util", "rustls", + "rustls-native-certs", "rustls-pki-types", "tokio", "tokio-rustls", @@ -1455,9 +1455,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b875924a60b96e5d7b9ae7b066540b1dd1cbd90d1828f54c92e02a283351c56" +checksum = "3ab92f4f49ee4fb4f997c784b7a2e0fa70050211e0b6a287f898c3c9785ca956" dependencies = [ "bytes", "futures-channel", @@ -1564,6 +1564,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + 
"either", +] + [[package]] name = "itoa" version = "1.0.11" @@ -1572,9 +1581,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" -version = "0.1.31" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" dependencies = [ "libc", ] @@ -1590,9 +1599,9 @@ dependencies = [ [[package]] name = "lazy_static" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lexical-core" @@ -1722,9 +1731,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.21" +version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "lz4_flex" @@ -1758,9 +1767,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.2" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "memoffset" @@ -1788,22 +1797,23 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87dfd01fe195c66b572b37921ad8803d010623c0aca821bea2302239d155cdae" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" dependencies = [ "adler", ] [[package]] 
name = "mio" -version = "0.8.11" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +checksum = "4569e456d394deccd22ce1c1913e6ea0e54519f577285001215d33557431afe4" dependencies = [ + "hermit-abi", "libc", "wasi", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -1828,9 +1838,9 @@ dependencies = [ [[package]] name = "num-bigint" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c165a9ab64cf766f73521c0dd2cfdff64f488b8f0b3e621face3462d3db536d7" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ "num-integer", "num-traits", @@ -1898,18 +1908,18 @@ dependencies = [ [[package]] name = "object" -version = "0.35.0" +version = "0.36.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8ec7ab813848ba4522158d5517a6093db1ded27575b070f4177b8d12b41db5e" +checksum = "3f203fa8daa7bb185f760ae12bd8e097f63d17041dcdcaf675ac54cdf863170e" dependencies = [ "memchr", ] [[package]] name = "object_store" -version = "0.10.1" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbebfd32c213ba1907fa7a9c9138015a8de2b43e30c5aa45b18f7deb46786ad6" +checksum = "e6da452820c715ce78221e8202ccc599b4a52f3e1eb3eedb487b680c81a8e3f3" dependencies = [ "async-trait", "base64 0.22.1", @@ -1918,7 +1928,7 @@ dependencies = [ "futures", "humantime", "hyper", - "itertools 0.12.1", + "itertools 0.13.0", "md-5", "parking_lot", "percent-encoding", @@ -1977,14 +1987,14 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-targets 0.52.5", + "windows-targets 0.52.6", ] [[package]] name = "parquet" -version = "52.0.0" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c3b5322cc1bbf67f11c079c42be41a55949099b78732f7dba9e15edde40eab" +checksum = 
"0f22ba0d95db56dde8685e3fadcb915cdaadda31ab8abbe3ff7f0ad1ef333267" dependencies = [ "ahash", "arrow-array", @@ -2012,7 +2022,7 @@ dependencies = [ "thrift", "tokio", "twox-hash", - "zstd 0.13.0", + "zstd 0.13.2", "zstd-sys", ] @@ -2139,7 +2149,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] @@ -2162,9 +2172,9 @@ checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" [[package]] name = "portable-atomic" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" +checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265" [[package]] name = "ppv-lite86" @@ -2179,14 +2189,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e" dependencies = [ "proc-macro2", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] name = "proc-macro2" -version = "1.0.85" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22244ce15aa966053a896d1accb3a6e68469b97c7f33f284b99f0d576879fc23" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] @@ -2218,7 +2228,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.68", + "syn 2.0.72", "tempfile", ] @@ -2232,7 +2242,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] @@ -2246,9 +2256,9 @@ dependencies = [ [[package]] name = "protobuf-src" -version = "2.0.1+26.1" +version = "2.1.0+27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8ba1cfa4b9dc098926b8cce388bf434b93516db3ecf6e8b1a37eb643d733ee7" +checksum = "a7edafa3bcc668fa93efafcbdf58d7821bbda0f4b458ac7fae3d57ec0fec8167" 
dependencies = [ "cmake", ] @@ -2300,7 +2310,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] @@ -2313,7 +2323,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] @@ -2324,14 +2334,60 @@ checksum = "658fa1faf7a4cc5f057c9ee5ef560f717ad9d8dc66d975267f709624d6e1ab88" [[package]] name = "quick-xml" -version = "0.31.0" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +checksum = "96a05e2e8efddfa51a84ca47cec303fac86c8541b686d37cac5efc0e094417bc" dependencies = [ "memchr", "serde", ] +[[package]] +name = "quinn" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4ceeeeabace7857413798eb1ffa1e9c905a9946a57d81fb69b4b71c4d8eb3ad" +dependencies = [ + "bytes", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls", + "thiserror", + "tokio", + "tracing", +] + +[[package]] +name = "quinn-proto" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddf517c03a109db8100448a4be38d498df8a210a99fe0e1b9eaf39e78c640efe" +dependencies = [ + "bytes", + "rand", + "ring", + "rustc-hash", + "rustls", + "slab", + "thiserror", + "tinyvec", + "tracing", +] + +[[package]] +name = "quinn-udp" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bffec3605b73c6f1754535084a85229fa8a30f86014e6c81aeec4abb68b0285" +dependencies = [ + "libc", + "once_cell", + "socket2", + "windows-sys 0.52.0", +] + [[package]] name = "quote" version = "1.0.36" @@ -2373,18 +2429,18 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.1" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e" 
+checksum = "2a908a6e00f1fdd0dfd9c0eb08ce85126f6d8bbda50017e74bc4a4b7d4a926a4" dependencies = [ - "bitflags 2.5.0", + "bitflags 2.6.0", ] [[package]] name = "regex" -version = "1.10.4" +version = "1.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" +checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" dependencies = [ "aho-corasick", "memchr", @@ -2394,9 +2450,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" dependencies = [ "aho-corasick", "memchr", @@ -2405,15 +2461,15 @@ dependencies = [ [[package]] name = "regex-lite" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b661b2f27137bdbc16f00eda72866a92bb28af1753ffbd56744fb6e2e9cd8e" +checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" [[package]] name = "regex-syntax" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "regress" @@ -2427,9 +2483,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.4" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10" +checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" dependencies = [ "base64 0.22.1", "bytes", @@ -2449,6 +2505,7 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", + "quinn", "rustls", 
"rustls-native-certs", "rustls-pemfile", @@ -2496,6 +2553,12 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc_version" version = "0.4.0" @@ -2511,7 +2574,7 @@ version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags 2.5.0", + "bitflags 2.6.0", "errno", "libc", "linux-raw-sys", @@ -2520,11 +2583,11 @@ dependencies = [ [[package]] name = "rustls" -version = "0.22.4" +version = "0.23.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" +checksum = "c58f8c84392efc0a126acce10fa59ff7b3d2ac06ab451a33f2741989b806b044" dependencies = [ - "log", + "once_cell", "ring", "rustls-pki-types", "rustls-webpki", @@ -2534,9 +2597,9 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.7.0" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792" +checksum = "a88d6d420651b496bdd98684116959239430022a115c1240e6c3993be0b15fba" dependencies = [ "openssl-probe", "rustls-pemfile", @@ -2563,9 +2626,9 @@ checksum = "976295e77ce332211c0d24d92c0e83e50f5c5f046d11082cea19f3df13a3562d" [[package]] name = "rustls-webpki" -version = "0.102.4" +version = "0.102.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff448f7e92e913c4b7d4c6d8e4540a1724b319b4152b8aef6d4cf8339712b33e" +checksum = "8e6b52d4fda176fd835fdc55a835d4a89b8499cad995885a21149d5ad62f852e" dependencies = [ "ring", 
"rustls-pki-types", @@ -2623,7 +2686,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] @@ -2634,11 +2697,11 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "security-framework" -version = "2.11.0" +version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c627723fd09706bacdb5cf41499e95098555af3c3c29d014dc3c458ef6be11c0" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags 2.5.0", + "bitflags 2.6.0", "core-foundation", "core-foundation-sys", "libc", @@ -2647,9 +2710,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.11.0" +version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "317936bbbd05227752583946b9e66d7ce3b489f84e11a94a510b4437fef407d7" +checksum = "75da29fe9b9b08fe9d6b22b5b4bcbc75d8db3aa31e639aa56bb62e9d46bfceaf" dependencies = [ "core-foundation-sys", "libc", @@ -2672,22 +2735,22 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.203" +version = "1.0.204" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" +checksum = "bc76f558e0cbb2a839d37354c575f1dc3fdc6546b5be373ba43d95f231bf7c12" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.203" +version = "1.0.204" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" +checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] @@ -2698,14 +2761,14 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ 
"proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] name = "serde_json" -version = "1.0.117" +version = "1.0.120" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "455182ea6142b14f93f4bc5320a2b31c1f266b66a4a5c858b013302a5d8cbfc3" +checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5" dependencies = [ "itoa", "ryu", @@ -2714,14 +2777,14 @@ dependencies = [ [[package]] name = "serde_tokenstream" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a00ffd23fd882d096f09fcaae2a9de8329a328628e86027e049ee051dc1621f" +checksum = "8790a7c3fe883e443eaa2af6f705952bc5d6e8671a220b9335c8cae92c037e74" dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] @@ -2843,7 +2906,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] @@ -2860,9 +2923,9 @@ checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" [[package]] name = "strum" -version = "0.26.2" +version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" dependencies = [ "strum_macros 0.26.4", ] @@ -2877,7 +2940,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] @@ -2890,7 +2953,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] @@ -2913,16 +2976,16 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.68", + "syn 2.0.72", "typify", "walkdir", ] [[package]] name = "subtle" -version = "2.5.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" @@ -2937,9 +3000,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.68" +version = "2.0.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" +checksum = "dc4b9b9bf2add8093d3f2c0204471e951b2285580335de42f9d2534f3ae7a8af" dependencies = [ "proc-macro2", "quote", @@ -2948,15 +3011,15 @@ dependencies = [ [[package]] name = "sync_wrapper" -version = "0.1.2" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" [[package]] name = "target-lexicon" -version = "0.12.14" +version = "0.12.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1fc403891a21bcfb7c37834ba66a547a8f402146eba7265b5a6d88059c9ff2f" +checksum = "4873307b7c257eddcb50c9bedf158eb669578359fb28428bef438fec8e6ba7c2" [[package]] name = "tempfile" @@ -2972,22 +3035,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.61" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" +checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.61" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" +checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] @@ -3012,9 +3075,9 @@ dependencies 
= [ [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" dependencies = [ "tinyvec_macros", ] @@ -3027,37 +3090,36 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.38.0" +version = "1.39.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" +checksum = "d040ac2b29ab03b09d4129c2f5bbd012a3ac2f79d38ff506a4bf8dd34b0eac8a" dependencies = [ "backtrace", "bytes", "libc", "mio", - "num_cpus", "pin-project-lite", "socket2", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "tokio-macros" -version = "2.3.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] name = "tokio-rustls" -version = "0.25.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f" +checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ "rustls", "rustls-pki-types", @@ -3123,7 +3185,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] @@ -3168,7 +3230,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] @@ -3202,7 +3264,7 @@ 
dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.68", + "syn 2.0.72", "thiserror", "unicode-ident", ] @@ -3220,7 +3282,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.68", + "syn 2.0.72", "typify-impl", ] @@ -3277,9 +3339,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.0" +version = "2.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" dependencies = [ "form_urlencoded", "idna", @@ -3288,9 +3350,9 @@ dependencies = [ [[package]] name = "uuid" -version = "1.9.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5de17fd2f7da591098415cff336e12965a28061ddace43b59cb3c430179c9439" +checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" dependencies = [ "getrandom", "serde", @@ -3348,7 +3410,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.72", "wasm-bindgen-shared", ] @@ -3382,7 +3444,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.72", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3431,7 +3493,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.52.5", + "windows-targets 0.52.6", ] [[package]] @@ -3449,7 +3511,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.5", + "windows-targets 0.52.6", ] [[package]] @@ -3469,18 +3531,18 @@ dependencies = [ [[package]] name = 
"windows-targets" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.52.5", - "windows_aarch64_msvc 0.52.5", - "windows_i686_gnu 0.52.5", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", "windows_i686_gnullvm", - "windows_i686_msvc 0.52.5", - "windows_x86_64_gnu 0.52.5", - "windows_x86_64_gnullvm 0.52.5", - "windows_x86_64_msvc 0.52.5", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] @@ -3491,9 +3553,9 @@ checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" @@ -3503,9 +3565,9 @@ checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" @@ -3515,15 +3577,15 @@ checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnullvm" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" @@ -3533,9 +3595,9 @@ checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" @@ -3545,9 +3607,9 @@ checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" @@ -3557,9 +3619,9 @@ checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" @@ -3569,9 +3631,9 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.5" +version = 
"0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winreg" @@ -3594,22 +3656,22 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.7.34" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae87e3fcd617500e5d106f0380cf7b77f3c6092aae37191433159dda23cfb087" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.34" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.72", ] [[package]] @@ -3629,11 +3691,11 @@ dependencies = [ [[package]] name = "zstd" -version = "0.13.0" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110" +checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" dependencies = [ - "zstd-safe 7.0.0", + "zstd-safe 7.2.0", ] [[package]] @@ -3648,18 +3710,18 @@ dependencies = [ [[package]] name = "zstd-safe" -version = "7.0.0" +version = "7.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43747c7422e2924c11144d5229878b98180ef8b06cca4ab5af37afc8a8d8ea3e" +checksum = "fa556e971e7b568dc775c136fc9de8c779b1c2fc3a63defaafadffdbd3181afa" dependencies = [ "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.9+zstd.1.5.5" +version = "2.0.11+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" +checksum = "75652c55c0b6f3e6f12eb786fe1bc960396bf05a1eb3bf1f3691c3610ac2e6d4" dependencies = [ "cc", "pkg-config", diff --git a/src/common/data_type.rs b/src/common/data_type.rs index 3299a46f7..42c5aefe4 100644 --- a/src/common/data_type.rs +++ b/src/common/data_type.rs @@ -132,7 +132,7 @@ impl DataTypeMap { SqlType::FLOAT, )), DataType::Timestamp(unit, tz) => Ok(DataTypeMap::new( - DataType::Timestamp(unit.clone(), tz.clone()), + DataType::Timestamp(*unit, tz.clone()), PythonType::Datetime, SqlType::DATE, )), @@ -147,12 +147,12 @@ impl DataTypeMap { SqlType::DATE, )), DataType::Time32(unit) => Ok(DataTypeMap::new( - DataType::Time32(unit.clone()), + DataType::Time32(*unit), PythonType::Datetime, SqlType::DATE, )), DataType::Time64(unit) => Ok(DataTypeMap::new( - DataType::Time64(unit.clone()), + DataType::Time64(*unit), PythonType::Datetime, SqlType::DATE, )), @@ -160,7 +160,7 @@ impl DataTypeMap { format!("{:?}", arrow_type), ))), DataType::Interval(interval_unit) => Ok(DataTypeMap::new( - DataType::Interval(interval_unit.clone()), + DataType::Interval(*interval_unit), PythonType::Datetime, match interval_unit { IntervalUnit::DayTime => SqlType::INTERVAL_DAY, From f5801557e0804df68ef39de6a422835cf82ac9d3 Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Tue, 30 Jul 2024 22:47:56 -0500 Subject: [PATCH 004/248] Upgrade Datafusion 40 (#771) * chore: update datafusion deps * feat: impl ExecutionPlan::static_name() for DatasetExec This required trait method was added upstream [0] and recommends to simply forward to `static_name`. [0]: https://github.com/apache/datafusion/pull/10266 * feat: update first_value and last_value wrappers. Upstream signatures were changed for the new new `AggregateBuilder` api [0]. This simply gets the code to work. We should better incorporate that API into `datafusion-python`. 
[0] https://github.com/apache/datafusion/pull/10560 * migrate count to UDAF Builtin Count was removed upstream. TBD whether we want to re-implement `count_star` with new API. Ref: https://github.com/apache/datafusion/pull/10893 * migrate approx_percentile_cont, approx_distinct, and approx_median to UDAF Ref: approx_distinct https://github.com/apache/datafusion/pull/10851 Ref: approx_median https://github.com/apache/datafusion/pull/10840 Ref: approx_percentile_cont and _with_weight https://github.com/apache/datafusion/pull/10917 * migrate avg to UDAF Ref: https://github.com/apache/datafusion/pull/10964 * migrate corr to UDAF Ref: https://github.com/apache/datafusion/issues/10884 * migrate grouping to UDAF Ref: https://github.com/apache/datafusion/issues/10906 * add alias `mean` for UDAF `avg` * migrate stddev to UDAF Ref: https://github.com/apache/datafusion/issues/10827 * remove rust alias for stddev The python wrapper now provides stddev_samp alias. * migrate var_pop to UDAF Ref: https://github.com/apache/datafusion/pull/10836 * migrate regr_* functions to UDAF Ref: https://github.com/apache/datafusion/pull/10898 * migrate bitwise functions to UDAF The functions now take a single expression instead of a Vec<_>. Ref: https://github.com/apache/datafusion/pull/10930 * add missing variants for ScalarValue with todo * fix typo in approx_percentile_cont * add distinct arg to count * comment out failing test `approx_percentile_cont` is now returning a DoubleArray instead of an IntArray. This may be a bug upstream; it requires further investigation. * update tests to expect lowercase `sum` in query plans This was changed upstream.
Ref: https://github.com/apache/datafusion/pull/10831 * update ScalarType data_type map * add docs dependency pickleshare * re-implement count_star * lint: ruff python lint * lint: rust cargo fmt * include name of window function in error for find_window_fn * refactor `find_window_fn` for debug clarity * search default aggregate functions by both name and aliases The alias list no longer includes the name of the function. Ref: https://github.com/apache/datafusion/issues/10658 * fix markdown in find_window_fn docs * parameterize test_window_functions `first_value` and `last_value` are currently failing and marked as xfail. * add test ids to test_simple_select tests marked xfail * update find_window_fn to search built-ins first The behavior of `first_value` and `last_value` UDAFs currently does not match the built-in behavior. This allowed me to remove `marks=pytest.xfail` from the window tests. * improve first_call and last_call use of the builder API * remove trailing todos * fix examples/substrait.py * chore: remove explicit aliases from functions.rs Ref: https://github.com/apache/datafusion-python/issues/779 * remove `array_fn!` aliases * remove alias rules for `expr_fn_vec!` * remove alias rules from `expr_fn!` macro * remove unnecessary pyo3 var-arg signatures in functions.rs * remove pyo3 signatures that provided defaults for first_value and last_value * parametrize test_string_functions * test regr_ function wrappers Closes #778 --- Cargo.lock | 69 ++- Cargo.toml | 16 +- docs/requirements.txt | 3 +- examples/substrait.py | 2 +- python/datafusion/functions.py | 68 +-- python/datafusion/tests/test_aggregation.py | 3 +- python/datafusion/tests/test_dataframe.py | 116 ++-- python/datafusion/tests/test_functions.py | 184 +++--- python/datafusion/tests/test_sql.py | 17 +- src/common/data_type.rs | 5 + src/dataset_exec.rs | 5 + src/functions.rs | 646 ++++++++++++-------- 12 files changed, 635 insertions(+), 499 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 
0979306ab..c41ef771a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -739,9 +739,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "39.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f92d2d7a9cba4580900b32b009848d9eb35f1028ac84cdd6ddcf97612cd0068" +checksum = "ab9d55a9cd2634818953809f75ebe5248b00dd43c3227efb2a51a2d5feaad54e" dependencies = [ "ahash", "apache-avro", @@ -795,9 +795,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "39.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "effed030d2c1667eb1e11df5372d4981eaf5d11a521be32220b3985ae5ba6971" +checksum = "def66b642959e7f96f5d2da22e1f43d3bd35598f821e5ce351a0553e0f1b7367" dependencies = [ "ahash", "apache-avro", @@ -819,18 +819,18 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "39.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0091318129dad1359f08e4c6c71f855163c35bba05d1dbf983196f727857894" +checksum = "f104bb9cb44c06c9badf8a0d7e0855e5f7fa5e395b887d7f835e8a9457dc1352" dependencies = [ "tokio", ] [[package]] name = "datafusion-execution" -version = "39.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8385aba84fc4a06d3ebccfbcbf9b4f985e80c762fac634b49079f7cc14933fb1" +checksum = "2ac0fd8b5d80bbca3fc3b6f40da4e9f6907354824ec3b18bbd83fee8cf5c3c3e" dependencies = [ "arrow", "chrono", @@ -849,9 +849,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "39.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebb192f0055d2ce64e38ac100abc18e4e6ae9734d3c28eee522bbbd6a32108a3" +checksum = "2103d2cc16fb11ef1fa993a6cac57ed5cb028601db4b97566c90e5fa77aa1e68" dependencies = [ "ahash", "arrow", @@ -868,9 +868,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = 
"39.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27c081ae5b7edd712b92767fb8ed5c0e32755682f8075707666cd70835807c0b" +checksum = "a369332afd0ef5bd565f6db2139fb9f1dfdd0afa75a7f70f000b74208d76994f" dependencies = [ "arrow", "base64 0.22.1", @@ -880,7 +880,6 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr", - "datafusion-physical-expr", "hashbrown", "hex", "itertools 0.12.1", @@ -895,9 +894,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "39.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "feb28a4ea52c28a26990646986a27c4052829a2a2572386258679e19263f8b78" +checksum = "92718db1aff70c47e5abf9fc975768530097059e5db7c7b78cd64b5e9a11fc77" dependencies = [ "ahash", "arrow", @@ -913,9 +912,9 @@ dependencies = [ [[package]] name = "datafusion-functions-array" -version = "39.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89b17c02a74cdc87380a56758ec27e7d417356bf806f33062700908929aedb8a" +checksum = "30bb80f46ff3dcf4bb4510209c2ba9b8ce1b716ac8b7bf70c6bf7dca6260c831" dependencies = [ "arrow", "arrow-array", @@ -926,6 +925,7 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-functions", + "datafusion-functions-aggregate", "itertools 0.12.1", "log", "paste", @@ -933,9 +933,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "39.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12172f2a6c9eb4992a51e62d709eeba5dedaa3b5369cce37ff6c2260e100ba76" +checksum = "82f34692011bec4fdd6fc18c264bf8037b8625d801e6dd8f5111af15cb6d71d3" dependencies = [ "arrow", "async-trait", @@ -947,14 +947,15 @@ dependencies = [ "indexmap", "itertools 0.12.1", "log", + "paste", "regex-syntax", ] [[package]] name = "datafusion-physical-expr" -version = "39.0.0" +version = "40.0.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a3fce531b623e94180f6cd33d620ef01530405751b6ddd2fd96250cdbd78e2e" +checksum = "45538630defedb553771434a437f7ca8f04b9b3e834344aafacecb27dc65d5e5" dependencies = [ "ahash", "arrow", @@ -968,7 +969,6 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr", - "datafusion-functions-aggregate", "datafusion-physical-expr-common", "half", "hashbrown", @@ -983,21 +983,23 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "39.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "046400b6a2cc3ed57a7c576f5ae6aecc77804ac8e0186926b278b189305b2a77" +checksum = "9d8a72b0ca908e074aaeca52c14ddf5c28d22361e9cb6bc79bb733cd6661b536" dependencies = [ + "ahash", "arrow", "datafusion-common", "datafusion-expr", + "hashbrown", "rand", ] [[package]] name = "datafusion-physical-plan" -version = "39.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4aed47f5a2ad8766260befb375b201592e86a08b260256e168ae4311426a2bff" +checksum = "b504eae6107a342775e22e323e9103f7f42db593ec6103b28605b7b7b1405c4a" dependencies = [ "ahash", "arrow", @@ -1029,7 +1031,7 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "39.0.0" +version = "40.0.0" dependencies = [ "arrow", "async-trait", @@ -1059,9 +1061,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "39.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fa92bb1fd15e46ce5fb6f1c85f3ac054592560f294429a28e392b5f9cd4255e" +checksum = "e5db33f323f41b95ae201318ba654a9bf11113e58a51a1dff977b1a836d3d889" dependencies = [ "arrow", "arrow-array", @@ -1076,9 +1078,9 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "39.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8249d69665c1cd32e07789ed6dd1da6528a23019ef16d3483db52952b6f9f68a" +checksum = "434e52fbff22e6e04e6c787f603a6aba4961a7e249a29c743c5d4f609ec2dcef" dependencies = [ "arrow-buffer", "async-recursion", @@ -1089,6 +1091,7 @@ dependencies = [ "pbjson-types", "prost", "substrait", + "url", ] [[package]] @@ -2958,9 +2961,9 @@ dependencies = [ [[package]] name = "substrait" -version = "0.34.1" +version = "0.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04c77dec9b6c4e48ac828937bbe7cf473b0933168c5d76d51a5816ace7046be9" +checksum = "b1ee6e584c8bf37104b7eb51c25eae07a9321b0e01379bec3b7c462d2f42afbf" dependencies = [ "heck 0.5.0", "pbjson", diff --git a/Cargo.toml b/Cargo.toml index a77eca0c3..d05a617a3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "39.0.0" +version = "40.0.0" homepage = "https://datafusion.apache.org/python" repository = "https://github.com/apache/datafusion-python" authors = ["Apache DataFusion "] @@ -38,13 +38,13 @@ tokio = { version = "1.35", features = ["macros", "rt", "rt-multi-thread", "sync rand = "0.8" pyo3 = { version = "0.21", features = ["extension-module", "abi3", "abi3-py38"] } arrow = { version = "52", feature = ["pyarrow"] } -datafusion = { version = "39.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } -datafusion-common = { version = "39.0.0", features = ["pyarrow"] } -datafusion-expr = "39.0.0" -datafusion-functions-array = "39.0.0" -datafusion-optimizer = "39.0.0" -datafusion-sql = "39.0.0" -datafusion-substrait = { version = "39.0.0", optional = true } +datafusion = { version = "40.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } +datafusion-common = { version = "40.0.0", features = ["pyarrow"] } +datafusion-expr = "40.0.0" +datafusion-functions-array = "40.0.0" +datafusion-optimizer = "40.0.0" +datafusion-sql = "40.0.0" +datafusion-substrait = { version = "40.0.0", optional = true } prost = "0.12" prost-types = "0.12" 
uuid = { version = "1.9", features = ["v4"] } diff --git a/docs/requirements.txt b/docs/requirements.txt index 67f1ec6ac..42bc4e517 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -21,4 +21,5 @@ myst-parser maturin jinja2 ipython -pandas \ No newline at end of file +pandas +pickleshare \ No newline at end of file diff --git a/examples/substrait.py b/examples/substrait.py index fd4d0f9ca..fa6f77912 100644 --- a/examples/substrait.py +++ b/examples/substrait.py @@ -46,4 +46,4 @@ # Back to Substrait Plan just for demonstration purposes # type(substrait_plan) -> -substrait_plan = ss.Producer.to_substrait_plan(df_logical_plan) +substrait_plan = ss.Producer.to_substrait_plan(df_logical_plan, ctx) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 46d2a2f04..ca41f5ff7 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -101,7 +101,7 @@ def concat(*args: Expr) -> Expr: NULL arguments are ignored. """ args = [arg.expr for arg in args] - return Expr(f.concat(*args)) + return Expr(f.concat(args)) def concat_ws(separator: str, *args: Expr) -> Expr: @@ -110,7 +110,7 @@ def concat_ws(separator: str, *args: Expr) -> Expr: `NULL` arugments are ignored. `separator` should not be `NULL`. 
""" args = [arg.expr for arg in args] - return Expr(f.concat_ws(separator, *args)) + return Expr(f.concat_ws(separator, args)) def order_by(expr: Expr, ascending: bool = True, nulls_first: bool = True) -> Expr: @@ -757,7 +757,7 @@ def upper(arg: Expr) -> Expr: def make_array(*args: Expr) -> Expr: """Returns an array using the specified input expressions.""" args = [arg.expr for arg in args] - return Expr(f.make_array(*args)) + return Expr(f.make_array(args)) def array(*args: Expr) -> Expr: @@ -840,7 +840,7 @@ def list_push_back(array: Expr, element: Expr) -> Expr: def array_concat(*args: Expr) -> Expr: """Concatenates the input arrays.""" args = [arg.expr for arg in args] - return Expr(f.array_concat(*args)) + return Expr(f.array_concat(args)) def array_cat(*args: Expr) -> Expr: @@ -1233,9 +1233,9 @@ def flatten(array: Expr) -> Expr: # aggregate functions -def approx_distinct(arg: Expr) -> Expr: +def approx_distinct(expression: Expr) -> Expr: """Returns the approximate number of distinct values.""" - return Expr(f.approx_distinct(arg.expr, distinct=True)) + return Expr(f.approx_distinct(expression.expr)) def approx_median(arg: Expr, distinct: bool = False) -> Expr: @@ -1244,20 +1244,21 @@ def approx_median(arg: Expr, distinct: bool = False) -> Expr: def approx_percentile_cont( - expr: Expr, + expression: Expr, percentile: Expr, - num_centroids: int | None = None, distinct: bool = False, ) -> Expr: """Returns the value that is approximately at a given percentile of ``expr``.""" + # Re-enable num_centroids: https://github.com/apache/datafusion-python/issues/777 + num_centroids = None if num_centroids is None: return Expr( - f.approx_percentile_cont(expr.expr, percentile.expr, distinct=distinct) + f.approx_percentile_cont(expression.expr, percentile.expr, distinct=distinct) ) return Expr( f.approx_percentile_cont( - expr.expr, percentile.expr, num_centroids, distinct=distinct + expression.expr, percentile.expr, distinct=distinct ) ) @@ -1306,7 +1307,7 @@ def covar(y: 
Expr, x: Expr) -> Expr: This is an alias for `covar_samp`. """ - return Expr(f.covar(y.expr, x.expr)) + return covar_samp(y, x) def covar_pop(y: Expr, x: Expr) -> Expr: @@ -1324,7 +1325,7 @@ def grouping(arg: Expr, distinct: bool = False) -> Expr: Returns 1 if the value of the argument is aggregated, 0 if not. """ - return Expr(f.grouping([arg.expr], distinct=distinct)) + return Expr(f.grouping(arg.expr, distinct=distinct)) def max(arg: Expr, distinct: bool = False) -> Expr: @@ -1396,7 +1397,7 @@ def regr_avgx(y: Expr, x: Expr, distinct: bool = False) -> Expr: Only non-null pairs of the inputs are evaluated. """ - return Expr(f.regr_avgx[y.expr, x.expr], distinct) + return Expr(f.regr_avgx(y.expr, x.expr, distinct)) def regr_avgy(y: Expr, x: Expr, distinct: bool = False) -> Expr: @@ -1404,42 +1405,42 @@ def regr_avgy(y: Expr, x: Expr, distinct: bool = False) -> Expr: Only non-null pairs of the inputs are evaluated. """ - return Expr(f.regr_avgy[y.expr, x.expr], distinct) + return Expr(f.regr_avgy(y.expr, x.expr, distinct)) def regr_count(y: Expr, x: Expr, distinct: bool = False) -> Expr: """Counts the number of rows in which both expressions are not null.""" - return Expr(f.regr_count[y.expr, x.expr], distinct) + return Expr(f.regr_count(y.expr, x.expr, distinct)) def regr_intercept(y: Expr, x: Expr, distinct: bool = False) -> Expr: """Computes the intercept from the linear regression.""" - return Expr(f.regr_intercept[y.expr, x.expr], distinct) + return Expr(f.regr_intercept(y.expr, x.expr, distinct)) def regr_r2(y: Expr, x: Expr, distinct: bool = False) -> Expr: """Computes the R-squared value from linear regression.""" - return Expr(f.regr_r2[y.expr, x.expr], distinct) + return Expr(f.regr_r2(y.expr, x.expr, distinct)) def regr_slope(y: Expr, x: Expr, distinct: bool = False) -> Expr: """Computes the slope from linear regression.""" - return Expr(f.regr_slope[y.expr, x.expr], distinct) + return Expr(f.regr_slope(y.expr, x.expr, distinct)) def regr_sxx(y: Expr, x: 
Expr, distinct: bool = False) -> Expr: """Computes the sum of squares of the independent variable `x`.""" - return Expr(f.regr_sxx[y.expr, x.expr], distinct) + return Expr(f.regr_sxx(y.expr, x.expr, distinct)) def regr_sxy(y: Expr, x: Expr, distinct: bool = False) -> Expr: """Computes the sum of products of pairs of numbers.""" - return Expr(f.regr_sxy[y.expr, x.expr], distinct) + return Expr(f.regr_sxy(y.expr, x.expr, distinct)) def regr_syy(y: Expr, x: Expr, distinct: bool = False) -> Expr: """Computes the sum of squares of the dependent variable `y`.""" - return Expr(f.regr_syy[y.expr, x.expr], distinct) + return Expr(f.regr_syy(y.expr, x.expr, distinct)) def first_value( @@ -1480,31 +1481,26 @@ def last_value( ) -def bit_and(*args: Expr, distinct: bool = False) -> Expr: +def bit_and(arg: Expr, distinct: bool = False) -> Expr: """Computes the bitwise AND of the argument.""" - args = [arg.expr for arg in args] - return Expr(f.bit_and(*args, distinct=distinct)) + return Expr(f.bit_and(arg.expr, distinct=distinct)) -def bit_or(*args: Expr, distinct: bool = False) -> Expr: +def bit_or(arg: Expr, distinct: bool = False) -> Expr: """Computes the bitwise OR of the argument.""" - args = [arg.expr for arg in args] - return Expr(f.bit_or(*args, distinct=distinct)) + return Expr(f.bit_or(arg.expr, distinct=distinct)) -def bit_xor(*args: Expr, distinct: bool = False) -> Expr: +def bit_xor(arg: Expr, distinct: bool = False) -> Expr: """Computes the bitwise XOR of the argument.""" - args = [arg.expr for arg in args] - return Expr(f.bit_xor(*args, distinct=distinct)) + return Expr(f.bit_xor(arg.expr, distinct=distinct)) -def bool_and(*args: Expr, distinct: bool = False) -> Expr: +def bool_and(arg: Expr, distinct: bool = False) -> Expr: """Computes the boolean AND of the arugment.""" - args = [arg.expr for arg in args] - return Expr(f.bool_and(*args, distinct=distinct)) + return Expr(f.bool_and(arg.expr, distinct=distinct)) -def bool_or(*args: Expr, distinct: bool = False) -> 
Expr: +def bool_or(arg: Expr, distinct: bool = False) -> Expr: """Computes the boolean OR of the arguement.""" - args = [arg.expr for arg in args] - return Expr(f.bool_or(*args, distinct=distinct)) + return Expr(f.bool_or(arg.expr, distinct=distinct)) diff --git a/python/datafusion/tests/test_aggregation.py b/python/datafusion/tests/test_aggregation.py index 99a470b6b..c10e5f36c 100644 --- a/python/datafusion/tests/test_aggregation.py +++ b/python/datafusion/tests/test_aggregation.py @@ -79,7 +79,8 @@ def test_built_in_aggregation(df): assert result.column(0) == pa.array([2], type=pa.uint64()) assert result.column(1) == pa.array([4]) assert result.column(2) == pa.array([4]) - assert result.column(3) == pa.array([6]) + # Ref: https://github.com/apache/datafusion-python/issues/777 + # assert result.column(3) == pa.array([6]) assert result.column(4) == pa.array([[4, 4, 6]]) np.testing.assert_array_almost_equal(result.column(5), np.average(values_a)) np.testing.assert_array_almost_equal( diff --git a/python/datafusion/tests/test_dataframe.py b/python/datafusion/tests/test_dataframe.py index 25875da77..6444d9321 100644 --- a/python/datafusion/tests/test_dataframe.py +++ b/python/datafusion/tests/test_dataframe.py @@ -278,63 +278,48 @@ def test_distinct(): assert df_a.collect() == df_b.collect() -def test_window_functions(df): +data_test_window_functions = [ + ("row", f.window("row_number", [], order_by=[f.order_by(column("c"))]), [2, 1, 3]), + ("rank", f.window("rank", [], order_by=[f.order_by(column("c"))]), [2, 1, 2]), + ("dense_rank", f.window("dense_rank", [], order_by=[f.order_by(column("c"))]), [2, 1, 2] ), + ("percent_rank", f.window("percent_rank", [], order_by=[f.order_by(column("c"))]), [0.5, 0, 0.5]), + ("cume_dist", f.window("cume_dist", [], order_by=[f.order_by(column("b"))]), [0.3333333333333333, 0.6666666666666666, 1.0]), + ("ntile", f.window("ntile", [literal(2)], order_by=[f.order_by(column("c"))]), [1, 1, 2]), + ("next", f.window("lead", [column("b")], 
order_by=[f.order_by(column("b"))]), [5, 6, None]), + ("previous", f.window("lag", [column("b")], order_by=[f.order_by(column("b"))]), [None, 4, 5]), + pytest.param( + "first_value", + f.window( + "first_value", + [column("a")], + order_by=[f.order_by(column("b"))] + ), + [1, 1, 1], + ), + pytest.param( + "last_value", + f.window("last_value", [column("b")], order_by=[f.order_by(column("b"))]), + [4, 5, 6], + ), + pytest.param( + "2nd_value", + f.window( + "nth_value", + [column("b"), literal(2)], + order_by=[f.order_by(column("b"))], + ), + [None, 5, 5], + ), +] + + +@pytest.mark.parametrize("name,expr,result", data_test_window_functions) +def test_window_functions(df, name, expr, result): df = df.select( column("a"), column("b"), column("c"), - f.alias( - f.window("row_number", [], order_by=[f.order_by(column("c"))]), - "row", - ), - f.alias( - f.window("rank", [], order_by=[f.order_by(column("c"))]), - "rank", - ), - f.alias( - f.window("dense_rank", [], order_by=[f.order_by(column("c"))]), - "dense_rank", - ), - f.alias( - f.window("percent_rank", [], order_by=[f.order_by(column("c"))]), - "percent_rank", - ), - f.alias( - f.window("cume_dist", [], order_by=[f.order_by(column("b"))]), - "cume_dist", - ), - f.alias( - f.window("ntile", [literal(2)], order_by=[f.order_by(column("c"))]), - "ntile", - ), - f.alias( - f.window("lag", [column("b")], order_by=[f.order_by(column("b"))]), - "previous", - ), - f.alias( - f.window("lead", [column("b")], order_by=[f.order_by(column("b"))]), - "next", - ), - f.alias( - f.window( - "first_value", - [column("a")], - order_by=[f.order_by(column("b"))], - ), - "first_value", - ), - f.alias( - f.window("last_value", [column("b")], order_by=[f.order_by(column("b"))]), - "last_value", - ), - f.alias( - f.window( - "nth_value", - [column("b"), literal(2)], - order_by=[f.order_by(column("b"))], - ), - "2nd_value", - ), + f.alias(expr, name) ) table = pa.Table.from_batches(df.collect()) @@ -343,18 +328,9 @@ def 
test_window_functions(df): "a": [1, 2, 3], "b": [4, 5, 6], "c": [8, 5, 8], - "row": [2, 1, 3], - "rank": [2, 1, 2], - "dense_rank": [2, 1, 2], - "percent_rank": [0.5, 0, 0.5], - "cume_dist": [0.3333333333333333, 0.6666666666666666, 1.0], - "ntile": [1, 1, 2], - "next": [5, 6, None], - "previous": [None, 4, 5], - "first_value": [1, 1, 1], - "last_value": [4, 5, 6], - "2nd_value": [None, 5, 5], + name: result } + assert table.sort_by("a").to_pydict() == expected @@ -434,13 +410,13 @@ def test_explain(df): def test_logical_plan(aggregate_df): plan = aggregate_df.logical_plan() - expected = "Projection: test.c1, SUM(test.c2)" + expected = "Projection: test.c1, sum(test.c2)" assert expected == plan.display() expected = ( - "Projection: test.c1, SUM(test.c2)\n" - " Aggregate: groupBy=[[test.c1]], aggr=[[SUM(test.c2)]]\n" + "Projection: test.c1, sum(test.c2)\n" + " Aggregate: groupBy=[[test.c1]], aggr=[[sum(test.c2)]]\n" " TableScan: test" ) @@ -450,12 +426,12 @@ def test_logical_plan(aggregate_df): def test_optimized_logical_plan(aggregate_df): plan = aggregate_df.optimized_logical_plan() - expected = "Aggregate: groupBy=[[test.c1]], aggr=[[SUM(test.c2)]]" + expected = "Aggregate: groupBy=[[test.c1]], aggr=[[sum(test.c2)]]" assert expected == plan.display() expected = ( - "Aggregate: groupBy=[[test.c1]], aggr=[[SUM(test.c2)]]\n" + "Aggregate: groupBy=[[test.c1]], aggr=[[sum(test.c2)]]\n" " TableScan: test projection=[c1, c2]" ) @@ -466,7 +442,7 @@ def test_execution_plan(aggregate_df): plan = aggregate_df.execution_plan() expected = ( - "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[SUM(test.c2)]\n" # noqa: E501 + "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[sum(test.c2)]\n" # noqa: E501 ) assert expected == plan.display() diff --git a/python/datafusion/tests/test_functions.py b/python/datafusion/tests/test_functions.py index 25d7de14a..293912321 100644 --- a/python/datafusion/tests/test_functions.py +++ 
b/python/datafusion/tests/test_functions.py @@ -567,87 +567,51 @@ def test_array_function_obj_tests(stmt, py_expr): assert a == b -def test_string_functions(df): - df = df.select( - f.ascii(column("a")), - f.bit_length(column("a")), - f.btrim(literal(" World ")), - f.character_length(column("a")), - f.chr(literal(68)), - f.concat_ws("-", column("a"), literal("test")), - f.concat(column("a"), literal("?")), - f.initcap(column("c")), - f.left(column("a"), literal(3)), - f.length(column("c")), - f.lower(column("a")), - f.lpad(column("a"), literal(7)), - f.ltrim(column("c")), - f.md5(column("a")), - f.octet_length(column("a")), - f.repeat(column("a"), literal(2)), - f.replace(column("a"), literal("l"), literal("?")), - f.reverse(column("a")), - f.right(column("a"), literal(4)), - f.rpad(column("a"), literal(8)), - f.rtrim(column("c")), - f.split_part(column("a"), literal("l"), literal(1)), - f.starts_with(column("a"), literal("Wor")), - f.strpos(column("a"), literal("o")), - f.substr(column("a"), literal(3)), - f.translate(column("a"), literal("or"), literal("ld")), - f.trim(column("c")), - f.upper(column("c")), - f.ends_with(column("a"), literal("llo")), - f.overlay(column("a"), literal("--"), literal(2)), - f.regexp_like(column("a"), literal("(ell|orl)")), - f.regexp_match(column("a"), literal("(ell|orl)")), - f.regexp_replace(column("a"), literal("(ell|orl)"), literal("-")), - ) - +@pytest.mark.parametrize("function, expected_result", [ + (f.ascii(column("a")), pa.array([72, 87, 33], type=pa.int32())), # H = 72; W = 87; ! 
= 33 + (f.bit_length(column("a")), pa.array([40, 40, 8], type=pa.int32())), + (f.btrim(literal(" World ")), pa.array(["World", "World", "World"])), + (f.character_length(column("a")), pa.array([5, 5, 1], type=pa.int32())), + (f.chr(literal(68)), pa.array(["D", "D", "D"])), + (f.concat_ws("-", column("a"), literal("test")), pa.array(["Hello-test", "World-test", "!-test"])), + (f.concat(column("a"), literal("?")), pa.array(["Hello?", "World?", "!?"])), + (f.initcap(column("c")), pa.array(["Hello ", " World ", " !"])), + (f.left(column("a"), literal(3)), pa.array(["Hel", "Wor", "!"])), + (f.length(column("c")), pa.array([6, 7, 2], type=pa.int32())), + (f.lower(column("a")), pa.array(["hello", "world", "!"])), + (f.lpad(column("a"), literal(7)), pa.array([" Hello", " World", " !"])), + (f.ltrim(column("c")), pa.array(["hello ", "world ", "!"])), + (f.md5(column("a")), pa.array([ + "8b1a9953c4611296a827abf8c47804d7", + "f5a7924e621e84c9280a9a27e1bcb7f6", + "9033e0e305f247c0c3c80d0c7848c8b3", + ])), + (f.octet_length(column("a")), pa.array([5, 5, 1], type=pa.int32())), + (f.repeat(column("a"), literal(2)), pa.array(["HelloHello", "WorldWorld", "!!"])), + (f.replace(column("a"), literal("l"), literal("?")), pa.array(["He??o", "Wor?d", "!"])), + (f.reverse(column("a")), pa.array(["olleH", "dlroW", "!"])), + (f.right(column("a"), literal(4)), pa.array(["ello", "orld", "!"])), + (f.rpad(column("a"), literal(8)), pa.array(["Hello ", "World ", "! 
"])), + (f.rtrim(column("c")), pa.array(["hello", " world", " !"])), + (f.split_part(column("a"), literal("l"), literal(1)), pa.array(["He", "Wor", "!"])), + (f.starts_with(column("a"), literal("Wor")), pa.array([False, True, False])), + (f.strpos(column("a"), literal("o")), pa.array([5, 2, 0], type=pa.int32())), + (f.substr(column("a"), literal(3)), pa.array(["llo", "rld", ""])), + (f.translate(column("a"), literal("or"), literal("ld")), pa.array(["Helll", "Wldld", "!"])), + (f.trim(column("c")), pa.array(["hello", "world", "!"])), + (f.upper(column("c")), pa.array(["HELLO ", " WORLD ", " !"])), + (f.ends_with(column("a"), literal("llo")), pa.array([True, False, False])), + (f.overlay(column("a"), literal("--"), literal(2)), pa.array(["H--lo", "W--ld", "--"])), + (f.regexp_like(column("a"), literal("(ell|orl)")), pa.array([True, True, False])), + (f.regexp_match(column("a"), literal("(ell|orl)")), pa.array([["ell"], ["orl"], None])), + (f.regexp_replace(column("a"), literal("(ell|orl)"), literal("-")), pa.array(["H-o", "W-d", "!"])), +]) +def test_string_functions(df, function, expected_result): + df = df.select(function) result = df.collect() assert len(result) == 1 result = result[0] - assert result.column(0) == pa.array( - [72, 87, 33], type=pa.int32() - ) # H = 72; W = 87; ! 
= 33 - assert result.column(1) == pa.array([40, 40, 8], type=pa.int32()) - assert result.column(2) == pa.array(["World", "World", "World"]) - assert result.column(3) == pa.array([5, 5, 1], type=pa.int32()) - assert result.column(4) == pa.array(["D", "D", "D"]) - assert result.column(5) == pa.array(["Hello-test", "World-test", "!-test"]) - assert result.column(6) == pa.array(["Hello?", "World?", "!?"]) - assert result.column(7) == pa.array(["Hello ", " World ", " !"]) - assert result.column(8) == pa.array(["Hel", "Wor", "!"]) - assert result.column(9) == pa.array([6, 7, 2], type=pa.int32()) - assert result.column(10) == pa.array(["hello", "world", "!"]) - assert result.column(11) == pa.array([" Hello", " World", " !"]) - assert result.column(12) == pa.array(["hello ", "world ", "!"]) - assert result.column(13) == pa.array( - [ - "8b1a9953c4611296a827abf8c47804d7", - "f5a7924e621e84c9280a9a27e1bcb7f6", - "9033e0e305f247c0c3c80d0c7848c8b3", - ] - ) - assert result.column(14) == pa.array([5, 5, 1], type=pa.int32()) - assert result.column(15) == pa.array(["HelloHello", "WorldWorld", "!!"]) - assert result.column(16) == pa.array(["He??o", "Wor?d", "!"]) - assert result.column(17) == pa.array(["olleH", "dlroW", "!"]) - assert result.column(18) == pa.array(["ello", "orld", "!"]) - assert result.column(19) == pa.array(["Hello ", "World ", "! 
"]) - assert result.column(20) == pa.array(["hello", " world", " !"]) - assert result.column(21) == pa.array(["He", "Wor", "!"]) - assert result.column(22) == pa.array([False, True, False]) - assert result.column(23) == pa.array([5, 2, 0], type=pa.int32()) - assert result.column(24) == pa.array(["llo", "rld", ""]) - assert result.column(25) == pa.array(["Helll", "Wldld", "!"]) - assert result.column(26) == pa.array(["hello", "world", "!"]) - assert result.column(27) == pa.array(["HELLO ", " WORLD ", " !"]) - assert result.column(28) == pa.array([True, False, False]) - assert result.column(29) == pa.array(["H--lo", "W--ld", "--"]) - assert result.column(30) == pa.array([True, True, False]) - assert result.column(31) == pa.array([["ell"], ["orl"], None]) - assert result.column(32) == pa.array(["H-o", "W-d", "!"]) + assert result.column(0) == expected_result def test_hash_functions(df): @@ -831,7 +795,7 @@ def test_case(df): assert result.column(2) == pa.array(["Hola", "Mundo", None]) -def test_regr_funcs(df): +def test_regr_funcs_sql(df): # test case base on # https://github.com/apache/arrow-datafusion/blob/d1361d56b9a9e0c165d3d71a8df6795d2a5f51dd/datafusion/core/tests/sqllogictests/test_files/aggregate.slt#L2330 ctx = SessionContext() @@ -853,6 +817,68 @@ def test_regr_funcs(df): assert result[0].column(8) == pa.array([0], type=pa.float64()) +def test_regr_funcs_sql_2(): + # test case based on `regr_*() basic tests + # https://github.com/apache/datafusion/blob/d1361d56b9a9e0c165d3d71a8df6795d2a5f51dd/datafusion/core/tests/sqllogictests/test_files/aggregate.slt#L2358C1-L2374C1 + ctx = SessionContext() + + # Perform the regression functions using SQL + result_sql = ctx.sql( + "select " + "regr_slope(column2, column1), " + "regr_intercept(column2, column1), " + "regr_count(column2, column1), " + "regr_r2(column2, column1), " + "regr_avgx(column2, column1), " + "regr_avgy(column2, column1), " + "regr_sxx(column2, column1), " + "regr_syy(column2, column1), " + 
"regr_sxy(column2, column1) " + "from (values (1,2), (2,4), (3,6))" + ).collect() + + # Assertions for SQL results + assert result_sql[0].column(0) == pa.array([2], type=pa.float64()) + assert result_sql[0].column(1) == pa.array([0], type=pa.float64()) + assert result_sql[0].column(2) == pa.array([3], type=pa.float64()) # todo: i would not expect this to be float + assert result_sql[0].column(3) == pa.array([1], type=pa.float64()) + assert result_sql[0].column(4) == pa.array([2], type=pa.float64()) + assert result_sql[0].column(5) == pa.array([4], type=pa.float64()) + assert result_sql[0].column(6) == pa.array([2], type=pa.float64()) + assert result_sql[0].column(7) == pa.array([8], type=pa.float64()) + assert result_sql[0].column(8) == pa.array([4], type=pa.float64()) + + +@pytest.mark.parametrize("func, expected", [ + pytest.param(f.regr_slope, pa.array([2], type=pa.float64()), id="regr_slope"), + pytest.param(f.regr_intercept, pa.array([0], type=pa.float64()), id="regr_intercept"), + pytest.param(f.regr_count, pa.array([3], type=pa.float64()), id="regr_count"), # TODO: I would expect this to return an int array + pytest.param(f.regr_r2, pa.array([1], type=pa.float64()), id="regr_r2"), + pytest.param(f.regr_avgx, pa.array([2], type=pa.float64()), id="regr_avgx"), + pytest.param(f.regr_avgy, pa.array([4], type=pa.float64()), id="regr_avgy"), + pytest.param(f.regr_sxx, pa.array([2], type=pa.float64()), id="regr_sxx"), + pytest.param(f.regr_syy, pa.array([8], type=pa.float64()), id="regr_syy"), + pytest.param(f.regr_sxy, pa.array([4], type=pa.float64()), id="regr_sxy") +]) +def test_regr_funcs_df(func, expected): + + # test case based on `regr_*() basic tests + # https://github.com/apache/datafusion/blob/d1361d56b9a9e0c165d3d71a8df6795d2a5f51dd/datafusion/core/tests/sqllogictests/test_files/aggregate.slt#L2358C1-L2374C1 + + + ctx = SessionContext() + + # Create a DataFrame + data = {'column1': [1, 2, 3], 'column2': [2, 4, 6]} + df = ctx.from_pydict(data, 
name="test_table") + + # Perform the regression function using DataFrame API + result_df = df.aggregate([], [func(f.col("column2"), f.col("column1"))]).collect() + + # Assertion for DataFrame API result + assert result_df[0].column(0) == expected + + def test_first_last_value(df): df = df.aggregate( [], diff --git a/python/datafusion/tests/test_sql.py b/python/datafusion/tests/test_sql.py index d85f380e7..1505fb1e7 100644 --- a/python/datafusion/tests/test_sql.py +++ b/python/datafusion/tests/test_sql.py @@ -378,17 +378,18 @@ def test_udf( # C data interface missing pytest.param( pa.array([b"1111", b"2222", b"3333"], pa.binary(4), _null_mask), + id="binary4", marks=pytest.mark.xfail, ), - pytest.param(helpers.data_datetime("s"), marks=pytest.mark.xfail), - pytest.param(helpers.data_datetime("ms"), marks=pytest.mark.xfail), - pytest.param(helpers.data_datetime("us"), marks=pytest.mark.xfail), - pytest.param(helpers.data_datetime("ns"), marks=pytest.mark.xfail), + pytest.param(helpers.data_datetime("s"), id="datetime_s", marks=pytest.mark.xfail), + pytest.param(helpers.data_datetime("ms"), id="datetime_ms", marks=pytest.mark.xfail), + pytest.param(helpers.data_datetime("us"), id="datetime_us", marks=pytest.mark.xfail), + pytest.param(helpers.data_datetime("ns"), id="datetime_ns", marks=pytest.mark.xfail), # Not writtable to parquet - pytest.param(helpers.data_timedelta("s"), marks=pytest.mark.xfail), - pytest.param(helpers.data_timedelta("ms"), marks=pytest.mark.xfail), - pytest.param(helpers.data_timedelta("us"), marks=pytest.mark.xfail), - pytest.param(helpers.data_timedelta("ns"), marks=pytest.mark.xfail), + pytest.param(helpers.data_timedelta("s"), id="timedelta_s", marks=pytest.mark.xfail), + pytest.param(helpers.data_timedelta("ms"), id="timedelta_ms", marks=pytest.mark.xfail), + pytest.param(helpers.data_timedelta("us"), id="timedelta_us", marks=pytest.mark.xfail), + pytest.param(helpers.data_timedelta("ns"), id="timedelta_ns", marks=pytest.mark.xfail), ], ) 
def test_simple_select(ctx, tmp_path, arr): diff --git a/src/common/data_type.rs b/src/common/data_type.rs index 42c5aefe4..469bb789a 100644 --- a/src/common/data_type.rs +++ b/src/common/data_type.rs @@ -326,6 +326,11 @@ impl DataTypeMap { ScalarValue::Union(_, _, _) => Err(py_datafusion_err(DataFusionError::NotImplemented( "ScalarValue::LargeList".to_string(), ))), + ScalarValue::Utf8View(_) => Ok(DataType::Utf8View), + ScalarValue::BinaryView(_) => Ok(DataType::BinaryView), + ScalarValue::Map(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( + "ScalarValue::Map".to_string(), + ))), } } } diff --git a/src/dataset_exec.rs b/src/dataset_exec.rs index 240c86486..5fe1f4d1b 100644 --- a/src/dataset_exec.rs +++ b/src/dataset_exec.rs @@ -152,6 +152,11 @@ impl DatasetExec { } impl ExecutionPlan for DatasetExec { + fn name(&self) -> &str { + // [ExecutionPlan::name] docs recommends forwarding to `static_name` + Self::static_name() + } + /// Return a reference to Any that can be used for downcasting fn as_any(&self) -> &dyn Any { self diff --git a/src/functions.rs b/src/functions.rs index 74eb48a62..e60c63c8e 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -16,6 +16,7 @@ // under the License. 
use datafusion::functions_aggregate::all_default_aggregate_functions; +use datafusion_expr::AggregateExt; use pyo3::{prelude::*, wrap_pyfunction}; use crate::common::data_type::NullTreatment; @@ -30,13 +31,141 @@ use datafusion::functions_aggregate; use datafusion_common::{Column, ScalarValue, TableReference}; use datafusion_expr::expr::Alias; use datafusion_expr::{ - aggregate_function, expr::{ find_df_window_func, AggregateFunction, AggregateFunctionDefinition, Sort, WindowFunction, }, lit, Expr, WindowFunctionDefinition, }; +#[pyfunction] +pub fn approx_distinct(expression: PyExpr) -> PyExpr { + functions_aggregate::expr_fn::approx_distinct::approx_distinct(expression.expr).into() +} + +#[pyfunction] +pub fn approx_median(expression: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::approx_median(expression.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn approx_percentile_cont( + expression: PyExpr, + percentile: PyExpr, + distinct: bool, +) -> PyResult { + let expr = + functions_aggregate::expr_fn::approx_percentile_cont(expression.expr, percentile.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn approx_percentile_cont_with_weight( + expression: PyExpr, + weight: PyExpr, + percentile: PyExpr, + distinct: bool, +) -> PyResult { + let expr = functions_aggregate::expr_fn::approx_percentile_cont_with_weight( + expression.expr, + weight.expr, + percentile.expr, + ); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn avg(expression: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::avg(expression.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn bit_and(expr_x: PyExpr, distinct: bool) -> PyResult { + let expr = 
functions_aggregate::expr_fn::bit_and(expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn bit_or(expression: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::bit_or(expression.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn bit_xor(expression: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::bit_xor(expression.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn bool_and(expression: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::bool_and(expression.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn bool_or(expression: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::bool_or(expression.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn corr(y: PyExpr, x: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::corr(y.expr, x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn grouping(expression: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::grouping(expression.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + #[pyfunction] pub fn sum(args: PyExpr) -> PyExpr { functions_aggregate::expr_fn::sum(args.expr).into() @@ -58,9 +187,23 @@ pub fn median(arg: PyExpr) -> PyExpr { } #[pyfunction] -pub fn covar(y: PyExpr, x: PyExpr) -> PyExpr { - // alias for covar_samp - covar_samp(y, x) +pub fn stddev(expression: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::stddev(expression.expr); + 
if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn stddev_pop(expression: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::stddev_pop(expression.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } } #[pyfunction] @@ -69,53 +212,166 @@ pub fn var_samp(expression: PyExpr) -> PyExpr { } #[pyfunction] -/// Alias for [`var_samp`] -pub fn var(y: PyExpr) -> PyExpr { - var_samp(y) +pub fn var_pop(expression: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::var_pop(expression.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_avgx(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::regr_avgx(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_avgy(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::regr_avgy(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_count(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::regr_count(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_intercept(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::regr_intercept(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_r2(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::regr_r2(expr_y.expr, expr_x.expr); + if 
distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_slope(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::regr_slope(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_sxx(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::regr_sxx(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_sxy(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::regr_sxy(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_syy(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::regr_syy(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } } #[pyfunction] -#[pyo3(signature = (*args, distinct = false, filter = None, order_by = None, null_treatment = None))] pub fn first_value( - args: Vec, + expr: PyExpr, distinct: bool, filter: Option, order_by: Option>, null_treatment: Option, -) -> PyExpr { - let null_treatment = null_treatment.map(Into::into); - let args = args.into_iter().map(|x| x.expr).collect::>(); - let order_by = order_by.map(|x| x.into_iter().map(|x| x.expr).collect::>()); - functions_aggregate::expr_fn::first_value( - args, - distinct, - filter.map(|x| Box::new(x.expr)), - order_by, - null_treatment, - ) - .into() +) -> PyResult { + // If we initialize the UDAF with order_by directly, then it gets over-written by the builder + let agg_fn = functions_aggregate::expr_fn::first_value(expr.expr, None); + + // luckily, I can guarantee initializing a builder with an 
`order_by` default of empty vec + let order_by = order_by + .map(|x| x.into_iter().map(|x| x.expr).collect::>()) + .unwrap_or_default(); + let mut builder = agg_fn.order_by(order_by); + + if distinct { + builder = builder.distinct(); + } + + if let Some(filter) = filter { + builder = builder.filter(filter.expr); + } + + if let Some(null_treatment) = null_treatment { + builder = builder.null_treatment(null_treatment.into()) + } + + Ok(builder.build()?.into()) } #[pyfunction] -#[pyo3(signature = (*args, distinct = false, filter = None, order_by = None, null_treatment = None))] pub fn last_value( - args: Vec, + expr: PyExpr, distinct: bool, filter: Option, order_by: Option>, null_treatment: Option, -) -> PyExpr { - let null_treatment = null_treatment.map(Into::into); - let args = args.into_iter().map(|x| x.expr).collect::>(); - let order_by = order_by.map(|x| x.into_iter().map(|x| x.expr).collect::>()); - functions_aggregate::expr_fn::last_value( - args, - distinct, - filter.map(|x| Box::new(x.expr)), - order_by, - null_treatment, - ) - .into() +) -> PyResult { + let agg_fn = functions_aggregate::expr_fn::last_value(vec![expr.expr]); + + // luckily, I can guarantee initializing a builder with an `order_by` default of empty vec + let order_by = order_by + .map(|x| x.into_iter().map(|x| x.expr).collect::>()) + .unwrap_or_default(); + let mut builder = agg_fn.order_by(order_by); + + if distinct { + builder = builder.distinct(); + } + + if let Some(filter) = filter { + builder = builder.filter(filter.expr); + } + + if let Some(null_treatment) = null_treatment { + builder = builder.null_treatment(null_treatment.into()) + } + + Ok(builder.build()?.into()) } #[pyfunction] @@ -129,34 +385,23 @@ fn in_list(expr: PyExpr, value: Vec, negated: bool) -> PyExpr { } #[pyfunction] -#[pyo3(signature = (*exprs))] fn make_array(exprs: Vec) -> PyExpr { datafusion_functions_array::expr_fn::make_array(exprs.into_iter().map(|x| x.into()).collect()) .into() } #[pyfunction] -#[pyo3(signature 
= (*exprs))] -fn array(exprs: Vec) -> PyExpr { - // alias for make_array - make_array(exprs) -} - -#[pyfunction] -#[pyo3(signature = (*exprs))] fn array_concat(exprs: Vec) -> PyExpr { let exprs = exprs.into_iter().map(|x| x.into()).collect(); datafusion_functions_array::expr_fn::array_concat(exprs).into() } #[pyfunction] -#[pyo3(signature = (*exprs))] fn array_cat(exprs: Vec) -> PyExpr { array_concat(exprs) } #[pyfunction] -#[pyo3(signature = (array, element, index = 1))] fn array_position(array: PyExpr, element: PyExpr, index: Option) -> PyExpr { let index = ScalarValue::Int64(index); let index = Expr::Literal(index); @@ -164,28 +409,6 @@ fn array_position(array: PyExpr, element: PyExpr, index: Option) -> PyExpr } #[pyfunction] -#[pyo3(signature = (array, element, index = 1))] -fn array_indexof(array: PyExpr, element: PyExpr, index: Option) -> PyExpr { - // alias of array_position - array_position(array, element, index) -} - -#[pyfunction] -#[pyo3(signature = (array, element, index = 1))] -fn list_position(array: PyExpr, element: PyExpr, index: Option) -> PyExpr { - // alias of array_position - array_position(array, element, index) -} - -#[pyfunction] -#[pyo3(signature = (array, element, index = 1))] -fn list_indexof(array: PyExpr, element: PyExpr, index: Option) -> PyExpr { - // alias of array_position - array_position(array, element, index) -} - -#[pyfunction] -#[pyo3(signature = (array, begin, end, stride = None))] fn array_slice(array: PyExpr, begin: PyExpr, end: PyExpr, stride: Option) -> PyExpr { datafusion_functions_array::expr_fn::array_slice( array.into(), @@ -196,18 +419,10 @@ fn array_slice(array: PyExpr, begin: PyExpr, end: PyExpr, stride: Option .into() } -#[pyfunction] -#[pyo3(signature = (array, begin, end, stride = None))] -fn list_slice(array: PyExpr, begin: PyExpr, end: PyExpr, stride: Option) -> PyExpr { - // alias of array_slice - array_slice(array, begin, end, stride) -} - /// Computes a binary hash of the given data. 
type is the algorithm to use. /// Standard algorithms are md5, sha224, sha256, sha384, sha512, blake2s, blake2b, and blake3. // #[pyfunction(value, method)] #[pyfunction] -#[pyo3(signature = (value, method))] fn digest(value: PyExpr, method: PyExpr) -> PyExpr { PyExpr { expr: functions::expr_fn::digest(value.expr, method.expr), @@ -217,7 +432,6 @@ fn digest(value: PyExpr, method: PyExpr) -> PyExpr { /// Concatenates the text representations of all the arguments. /// NULL arguments are ignored. #[pyfunction] -#[pyo3(signature = (*args))] fn concat(args: Vec) -> PyResult { let args = args.into_iter().map(|e| e.expr).collect::>(); Ok(functions::string::expr_fn::concat(args).into()) @@ -227,20 +441,17 @@ fn concat(args: Vec) -> PyResult { /// The first argument is used as the separator string, and should not be NULL. /// Other NULL arguments are ignored. #[pyfunction] -#[pyo3(signature = (sep, *args))] fn concat_ws(sep: String, args: Vec) -> PyResult { let args = args.into_iter().map(|e| e.expr).collect::>(); Ok(functions::string::expr_fn::concat_ws(lit(sep), args).into()) } #[pyfunction] -#[pyo3(signature = (values, regex, flags = None))] fn regexp_like(values: PyExpr, regex: PyExpr, flags: Option) -> PyResult { Ok(functions::expr_fn::regexp_like(values.expr, regex.expr, flags.map(|x| x.expr)).into()) } #[pyfunction] -#[pyo3(signature = (values, regex, flags = None))] fn regexp_match(values: PyExpr, regex: PyExpr, flags: Option) -> PyResult { Ok(functions::expr_fn::regexp_match(values.expr, regex.expr, flags.map(|x| x.expr)).into()) } @@ -293,21 +504,23 @@ fn col(name: &str) -> PyResult { }) } +// TODO: should we just expose this in python? 
/// Create a COUNT(1) aggregate expression #[pyfunction] -fn count_star() -> PyResult { - Ok(PyExpr { - expr: Expr::AggregateFunction(AggregateFunction { - func_def: datafusion_expr::expr::AggregateFunctionDefinition::BuiltIn( - aggregate_function::AggregateFunction::Count, - ), - args: vec![lit(1)], - distinct: false, - filter: None, - order_by: None, - null_treatment: None, - }), - }) +fn count_star() -> PyExpr { + functions_aggregate::expr_fn::count(lit(1)).into() +} + +/// Wrapper for [`functions_aggregate::expr_fn::count`] +/// Count the number of non-null values in the column +#[pyfunction] +fn count(expr: PyExpr, distinct: bool) -> PyResult { + let expr = functions_aggregate::expr_fn::count(expr.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } } /// Create a CASE WHEN statement with literal WHEN expressions for comparison to the base expression. @@ -318,48 +531,70 @@ fn case(expr: PyExpr) -> PyResult { }) } -/// Helper function to find the appropriate window function. First, if a session -/// context is defined check it's registered functions. If no context is defined, -/// attempt to find from all default functions. Lastly, as a fall back attempt -/// to use built in window functions, which are being deprecated. +/// Helper function to find the appropriate window function. +/// +/// Search procedure: +/// 1) Search built in window functions, which are being deprecated. +/// 1) If a session context is provided: +/// 1) search User Defined Aggregate Functions (UDAFs) +/// 1) search registered window functions +/// 1) search registered aggregate functions +/// 1) If no function has been found, search default aggregate functions. +/// +/// NOTE: we search the built-ins first because the `UDAF` versions currently do not have the same behavior. 
fn find_window_fn(name: &str, ctx: Option) -> PyResult { - let mut maybe_fn = match &ctx { - Some(ctx) => { - let session_state = ctx.ctx.state(); - - match session_state.window_functions().contains_key(name) { - true => session_state - .window_functions() - .get(name) - .map(|f| WindowFunctionDefinition::WindowUDF(f.clone())), - false => session_state - .aggregate_functions() - .get(name) - .map(|f| WindowFunctionDefinition::AggregateUDF(f.clone())), - } + // search built in window functions (soon to be deprecated) + let df_window_func = find_df_window_func(name); + if let Some(df_window_func) = df_window_func { + return Ok(df_window_func); + } + + if let Some(ctx) = ctx { + // search UDAFs + let udaf = ctx + .ctx + .udaf(name) + .map(WindowFunctionDefinition::AggregateUDF) + .ok(); + + if let Some(udaf) = udaf { + return Ok(udaf); } - None => { - let default_aggregate_fns = all_default_aggregate_functions(); - default_aggregate_fns - .iter() - .find(|v| v.aliases().contains(&name.to_string())) - .map(|f| WindowFunctionDefinition::AggregateUDF(f.clone())) + let session_state = ctx.ctx.state(); + + // search registered window functions + let window_fn = session_state + .window_functions() + .get(name) + .map(|f| WindowFunctionDefinition::WindowUDF(f.clone())); + + if let Some(window_fn) = window_fn { + return Ok(window_fn); } - }; - if maybe_fn.is_none() { - maybe_fn = find_df_window_func(name).or_else(|| { - ctx.and_then(|ctx| { - ctx.ctx - .udaf(name) - .map(WindowFunctionDefinition::AggregateUDF) - .ok() - }) - }); + // search registered aggregate functions + let agg_fn = session_state + .aggregate_functions() + .get(name) + .map(|f| WindowFunctionDefinition::AggregateUDF(f.clone())); + + if let Some(agg_fn) = agg_fn { + return Ok(agg_fn); + } } - maybe_fn.ok_or(DataFusionError::Common("window function not found".to_string()).into()) + // search default aggregate functions + let agg_fn = all_default_aggregate_functions() + .iter() + .find(|v| v.name() == name || 
v.aliases().contains(&name.to_string())) + .map(|f| WindowFunctionDefinition::AggregateUDF(f.clone())); + + if let Some(agg_fn) = agg_fn { + return Ok(agg_fn); + } + + Err(DataFusionError::Common(format!("window function `{name}` not found")).into()) } /// Creates a new Window function expression @@ -424,25 +659,19 @@ macro_rules! aggregate_function { /// /// These functions have explicit named arguments. macro_rules! expr_fn { - ($NAME: ident) => { - expr_fn!($NAME, $NAME, , stringify!($NAME)); - }; - ($NAME:ident, $($arg:ident)*) => { - expr_fn!($NAME, $NAME, $($arg)*, stringify!($FUNC)); - }; - ($NAME:ident, $FUNC:ident, $($arg:ident)*) => { - expr_fn!($NAME, $FUNC, $($arg)*, stringify!($FUNC)); + ($FUNC: ident) => { + expr_fn!($FUNC, , stringify!($FUNC)); }; - ($NAME: ident, $DOC: expr) => { - expr_fn!($NAME, $NAME, ,$DOC); + ($FUNC:ident, $($arg:ident)*) => { + expr_fn!($FUNC, $($arg)*, stringify!($FUNC)); }; - ($NAME: ident, $($arg:ident)*, $DOC: expr) => { - expr_fn!($NAME, $NAME, $($arg)* ,$DOC); + ($FUNC: ident, $DOC: expr) => { + expr_fn!($FUNC, ,$DOC); }; - ($NAME: ident, $FUNC: ident, $($arg:ident)*, $DOC: expr) => { + ($FUNC: ident, $($arg:ident)*, $DOC: expr) => { #[doc = $DOC] #[pyfunction] - fn $NAME($($arg: PyExpr),*) -> PyExpr { + fn $FUNC($($arg: PyExpr),*) -> PyExpr { functions::expr_fn::$FUNC($($arg.into()),*).into() } }; @@ -452,17 +681,14 @@ macro_rules! expr_fn { /// /// These functions take a single `Vec` argument using `pyo3(signature = (*args))`. macro_rules! 
expr_fn_vec { - ($NAME: ident) => { - expr_fn_vec!($NAME, $NAME, stringify!($NAME)); + ($FUNC: ident) => { + expr_fn_vec!($FUNC, stringify!($FUNC)); }; - ($NAME: ident, $DOC: expr) => { - expr_fn_vec!($NAME, $NAME, $DOC); - }; - ($NAME: ident, $FUNC: ident, $DOC: expr) => { + ($FUNC: ident, $DOC: expr) => { #[doc = $DOC] #[pyfunction] #[pyo3(signature = (*args))] - fn $NAME(args: Vec) -> PyExpr { + fn $FUNC(args: Vec) -> PyExpr { let args = args.into_iter().map(|e| e.into()).collect::>(); functions::expr_fn::$FUNC(args).into() } @@ -473,22 +699,19 @@ macro_rules! expr_fn_vec { /// /// These functions have explicit named arguments. macro_rules! array_fn { - ($NAME: ident) => { - array_fn!($NAME, $NAME, , stringify!($NAME)); + ($FUNC: ident) => { + array_fn!($FUNC, , stringify!($FUNC)); }; - ($NAME:ident, $($arg:ident)*) => { - array_fn!($NAME, $NAME, $($arg)*, stringify!($FUNC)); + ($FUNC:ident, $($arg:ident)*) => { + array_fn!($FUNC, $($arg)*, stringify!($FUNC)); }; - ($NAME: ident, $FUNC:ident, $($arg:ident)*) => { - array_fn!($NAME, $FUNC, $($arg)*, stringify!($FUNC)); + ($FUNC: ident, $DOC: expr) => { + array_fn!($FUNC, , $DOC); }; - ($NAME: ident, $DOC: expr) => { - array_fn!($NAME, $NAME, , $DOC); - }; - ($NAME: ident, $FUNC:ident, $($arg:ident)*, $DOC:expr) => { + ($FUNC: ident, $($arg:ident)*, $DOC:expr) => { #[doc = $DOC] #[pyfunction] - fn $NAME($($arg: PyExpr),*) -> PyExpr { + fn $FUNC($($arg: PyExpr),*) -> PyExpr { datafusion_functions_array::expr_fn::$FUNC($($arg.into()),*).into() } }; @@ -559,7 +782,6 @@ expr_fn!(octet_length, args, "Returns number of bytes in the string. 
Since this expr_fn_vec!(overlay); expr_fn!(pi); expr_fn!(power, base exponent); -expr_fn!(pow, power, base exponent); expr_fn!(radians, num); expr_fn!(repeat, string n, "Repeats string the specified number of times."); expr_fn!( @@ -611,9 +833,7 @@ expr_fn_vec!(to_unixtime); expr_fn!(current_date); expr_fn!(current_time); expr_fn!(date_part, part date); -expr_fn!(datepart, date_part, part date); expr_fn!(date_trunc, part date); -expr_fn!(datetrunc, date_trunc, part date); expr_fn!(date_bin, stride source origin); expr_fn!(make_date, year month day); @@ -630,95 +850,37 @@ expr_fn!(random); // Array Functions array_fn!(array_append, array element); -array_fn!(array_push_back, array_append, array element); array_fn!(array_to_string, array delimiter); -array_fn!(array_join, array_to_string, array delimiter); -array_fn!(list_to_string, array_to_string, array delimiter); -array_fn!(list_join, array_to_string, array delimiter); -array_fn!(list_append, array_append, array element); -array_fn!(list_push_back, array_append, array element); array_fn!(array_dims, array); array_fn!(array_distinct, array); -array_fn!(list_distinct, array_distinct, array); -array_fn!(list_dims, array_dims, array); array_fn!(array_element, array element); -array_fn!(array_extract, array_element, array element); -array_fn!(list_element, array_element, array element); -array_fn!(list_extract, array_element, array element); array_fn!(array_length, array); -array_fn!(list_length, array_length, array); array_fn!(array_has, first_array second_array); array_fn!(array_has_all, first_array second_array); array_fn!(array_has_any, first_array second_array); -array_fn!(array_positions, array_positions, array element); -array_fn!(list_positions, array_positions, array element); +array_fn!(array_positions, array element); array_fn!(array_ndims, array); -array_fn!(list_ndims, array_ndims, array); array_fn!(array_prepend, element array); -array_fn!(array_push_front, array_prepend, element array); 
-array_fn!(list_prepend, array_prepend, element array); -array_fn!(list_push_front, array_prepend, element array); array_fn!(array_pop_back, array); array_fn!(array_pop_front, array); array_fn!(array_remove, array element); -array_fn!(list_remove, array_remove, array element); array_fn!(array_remove_n, array element max); -array_fn!(list_remove_n, array_remove_n, array element max); array_fn!(array_remove_all, array element); -array_fn!(list_remove_all, array_remove_all, array element); array_fn!(array_repeat, element count); array_fn!(array_replace, array from to); -array_fn!(list_replace, array_replace, array from to); array_fn!(array_replace_n, array from to max); -array_fn!(list_replace_n, array_replace_n, array from to max); array_fn!(array_replace_all, array from to); -array_fn!(list_replace_all, array_replace_all, array from to); array_fn!(array_sort, array desc null_first); -array_fn!(list_sort, array_sort, array desc null_first); array_fn!(array_intersect, first_array second_array); -array_fn!(list_intersect, array_intersect, first_array second_array); array_fn!(array_union, array1 array2); -array_fn!(list_union, array_union, array1 array2); array_fn!(array_except, first_array second_array); -array_fn!(list_except, array_except, first_array second_array); array_fn!(array_resize, array size value); -array_fn!(list_resize, array_resize, array size value); array_fn!(flatten, array); array_fn!(range, start stop step); -aggregate_function!(approx_distinct, ApproxDistinct); -aggregate_function!(approx_median, ApproxMedian); -aggregate_function!(approx_percentile_cont, ApproxPercentileCont); -aggregate_function!( - approx_percentile_cont_with_weight, - ApproxPercentileContWithWeight -); aggregate_function!(array_agg, ArrayAgg); -aggregate_function!(avg, Avg); -aggregate_function!(corr, Correlation); -aggregate_function!(count, Count); -aggregate_function!(grouping, Grouping); aggregate_function!(max, Max); -aggregate_function!(mean, Avg); aggregate_function!(min, 
Min); -aggregate_function!(stddev, Stddev); -aggregate_function!(stddev_pop, StddevPop); -aggregate_function!(stddev_samp, Stddev); -aggregate_function!(var_pop, VariancePop); -aggregate_function!(regr_avgx, RegrAvgx); -aggregate_function!(regr_avgy, RegrAvgy); -aggregate_function!(regr_count, RegrCount); -aggregate_function!(regr_intercept, RegrIntercept); -aggregate_function!(regr_r2, RegrR2); -aggregate_function!(regr_slope, RegrSlope); -aggregate_function!(regr_sxx, RegrSXX); -aggregate_function!(regr_sxy, RegrSXY); -aggregate_function!(regr_syy, RegrSYY); -aggregate_function!(bit_and, BitAnd); -aggregate_function!(bit_or, BitOr); -aggregate_function!(bit_xor, BitXor); -aggregate_function!(bool_and, BoolAnd); -aggregate_function!(bool_or, BoolOr); pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(abs))?; @@ -729,7 +891,6 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(approx_median))?; m.add_wrapped(wrap_pyfunction!(approx_percentile_cont))?; m.add_wrapped(wrap_pyfunction!(approx_percentile_cont_with_weight))?; - m.add_wrapped(wrap_pyfunction!(array))?; m.add_wrapped(wrap_pyfunction!(range))?; m.add_wrapped(wrap_pyfunction!(array_agg))?; m.add_wrapped(wrap_pyfunction!(arrow_typeof))?; @@ -758,16 +919,13 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(cot))?; m.add_wrapped(wrap_pyfunction!(count))?; m.add_wrapped(wrap_pyfunction!(count_star))?; - m.add_wrapped(wrap_pyfunction!(covar))?; m.add_wrapped(wrap_pyfunction!(covar_pop))?; m.add_wrapped(wrap_pyfunction!(covar_samp))?; m.add_wrapped(wrap_pyfunction!(current_date))?; m.add_wrapped(wrap_pyfunction!(current_time))?; m.add_wrapped(wrap_pyfunction!(degrees))?; m.add_wrapped(wrap_pyfunction!(date_bin))?; - m.add_wrapped(wrap_pyfunction!(datepart))?; m.add_wrapped(wrap_pyfunction!(date_part))?; - m.add_wrapped(wrap_pyfunction!(datetrunc))?; 
m.add_wrapped(wrap_pyfunction!(date_trunc))?; m.add_wrapped(wrap_pyfunction!(make_date))?; m.add_wrapped(wrap_pyfunction!(digest))?; @@ -796,7 +954,6 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(max))?; m.add_wrapped(wrap_pyfunction!(make_array))?; m.add_wrapped(wrap_pyfunction!(md5))?; - m.add_wrapped(wrap_pyfunction!(mean))?; m.add_wrapped(wrap_pyfunction!(median))?; m.add_wrapped(wrap_pyfunction!(min))?; m.add_wrapped(wrap_pyfunction!(named_struct))?; @@ -808,7 +965,6 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(overlay))?; m.add_wrapped(wrap_pyfunction!(pi))?; m.add_wrapped(wrap_pyfunction!(power))?; - m.add_wrapped(wrap_pyfunction!(pow))?; m.add_wrapped(wrap_pyfunction!(radians))?; m.add_wrapped(wrap_pyfunction!(random))?; m.add_wrapped(wrap_pyfunction!(regexp_like))?; @@ -833,7 +989,6 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(starts_with))?; m.add_wrapped(wrap_pyfunction!(stddev))?; m.add_wrapped(wrap_pyfunction!(stddev_pop))?; - m.add_wrapped(wrap_pyfunction!(stddev_samp))?; m.add_wrapped(wrap_pyfunction!(strpos))?; m.add_wrapped(wrap_pyfunction!(r#struct))?; // Use raw identifier since struct is a keyword m.add_wrapped(wrap_pyfunction!(substr))?; @@ -854,7 +1009,6 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(trunc))?; m.add_wrapped(wrap_pyfunction!(upper))?; m.add_wrapped(wrap_pyfunction!(self::uuid))?; // Use self to avoid name collision - m.add_wrapped(wrap_pyfunction!(var))?; m.add_wrapped(wrap_pyfunction!(var_pop))?; m.add_wrapped(wrap_pyfunction!(var_samp))?; m.add_wrapped(wrap_pyfunction!(window))?; @@ -881,67 +1035,35 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { // Array Functions m.add_wrapped(wrap_pyfunction!(array_append))?; - m.add_wrapped(wrap_pyfunction!(array_push_back))?; - 
m.add_wrapped(wrap_pyfunction!(list_append))?; - m.add_wrapped(wrap_pyfunction!(list_push_back))?; m.add_wrapped(wrap_pyfunction!(array_concat))?; m.add_wrapped(wrap_pyfunction!(array_cat))?; m.add_wrapped(wrap_pyfunction!(array_dims))?; m.add_wrapped(wrap_pyfunction!(array_distinct))?; - m.add_wrapped(wrap_pyfunction!(list_distinct))?; - m.add_wrapped(wrap_pyfunction!(list_dims))?; m.add_wrapped(wrap_pyfunction!(array_element))?; - m.add_wrapped(wrap_pyfunction!(array_extract))?; - m.add_wrapped(wrap_pyfunction!(list_element))?; - m.add_wrapped(wrap_pyfunction!(list_extract))?; m.add_wrapped(wrap_pyfunction!(array_length))?; - m.add_wrapped(wrap_pyfunction!(list_length))?; m.add_wrapped(wrap_pyfunction!(array_has))?; m.add_wrapped(wrap_pyfunction!(array_has_all))?; m.add_wrapped(wrap_pyfunction!(array_has_any))?; m.add_wrapped(wrap_pyfunction!(array_position))?; - m.add_wrapped(wrap_pyfunction!(array_indexof))?; - m.add_wrapped(wrap_pyfunction!(list_position))?; - m.add_wrapped(wrap_pyfunction!(list_indexof))?; m.add_wrapped(wrap_pyfunction!(array_positions))?; - m.add_wrapped(wrap_pyfunction!(list_positions))?; m.add_wrapped(wrap_pyfunction!(array_to_string))?; m.add_wrapped(wrap_pyfunction!(array_intersect))?; - m.add_wrapped(wrap_pyfunction!(list_intersect))?; m.add_wrapped(wrap_pyfunction!(array_union))?; - m.add_wrapped(wrap_pyfunction!(list_union))?; m.add_wrapped(wrap_pyfunction!(array_except))?; - m.add_wrapped(wrap_pyfunction!(list_except))?; m.add_wrapped(wrap_pyfunction!(array_resize))?; - m.add_wrapped(wrap_pyfunction!(list_resize))?; - m.add_wrapped(wrap_pyfunction!(array_join))?; - m.add_wrapped(wrap_pyfunction!(list_to_string))?; - m.add_wrapped(wrap_pyfunction!(list_join))?; m.add_wrapped(wrap_pyfunction!(array_ndims))?; - m.add_wrapped(wrap_pyfunction!(list_ndims))?; m.add_wrapped(wrap_pyfunction!(array_prepend))?; - m.add_wrapped(wrap_pyfunction!(array_push_front))?; - m.add_wrapped(wrap_pyfunction!(list_prepend))?; - 
m.add_wrapped(wrap_pyfunction!(list_push_front))?; m.add_wrapped(wrap_pyfunction!(array_pop_back))?; m.add_wrapped(wrap_pyfunction!(array_pop_front))?; m.add_wrapped(wrap_pyfunction!(array_remove))?; - m.add_wrapped(wrap_pyfunction!(list_remove))?; m.add_wrapped(wrap_pyfunction!(array_remove_n))?; - m.add_wrapped(wrap_pyfunction!(list_remove_n))?; m.add_wrapped(wrap_pyfunction!(array_remove_all))?; - m.add_wrapped(wrap_pyfunction!(list_remove_all))?; m.add_wrapped(wrap_pyfunction!(array_repeat))?; m.add_wrapped(wrap_pyfunction!(array_replace))?; - m.add_wrapped(wrap_pyfunction!(list_replace))?; m.add_wrapped(wrap_pyfunction!(array_replace_n))?; - m.add_wrapped(wrap_pyfunction!(list_replace_n))?; m.add_wrapped(wrap_pyfunction!(array_replace_all))?; - m.add_wrapped(wrap_pyfunction!(list_replace_all))?; m.add_wrapped(wrap_pyfunction!(array_sort))?; - m.add_wrapped(wrap_pyfunction!(list_sort))?; m.add_wrapped(wrap_pyfunction!(array_slice))?; - m.add_wrapped(wrap_pyfunction!(list_slice))?; m.add_wrapped(wrap_pyfunction!(flatten))?; Ok(()) From 66bfe36d2b6d7f17446a4205aa98172abe439ee2 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 1 Aug 2024 10:53:04 -0400 Subject: [PATCH 005/248] Bugfix: Calling count with None arguments (#768) * Small bugfix. When arguments are None, we should use count_star * When no arguments are given to count, set argument to lit(1) so that it is similar to count_star but still enables you to select distinct=True. 
--- python/datafusion/functions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index ca41f5ff7..0cb5b0443 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1295,6 +1295,8 @@ def corr(value1: Expr, value2: Expr, distinct: bool = False) -> Expr: def count(args: Expr | list[Expr] | None = None, distinct: bool = False) -> Expr: """Returns the number of rows that match the given arguments.""" + if args is None: + return count(Expr.literal(1), distinct=distinct) if isinstance(args, list): args = [arg.expr for arg in args] elif isinstance(args, Expr): From 951d6b92c45c39f0dccd27d179ebdb3ddf78add8 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 1 Aug 2024 10:53:19 -0400 Subject: [PATCH 006/248] Add in user example that compares a two different approaches to UDFs (#770) * Add in user example that compares a two different approaches to UDFs * add license --- examples/python-udf-comparisons.py | 186 +++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 examples/python-udf-comparisons.py diff --git a/examples/python-udf-comparisons.py b/examples/python-udf-comparisons.py new file mode 100644 index 000000000..e2d856749 --- /dev/null +++ b/examples/python-udf-comparisons.py @@ -0,0 +1,186 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from datafusion import SessionContext, col, lit, udf, functions as F +import os +import pyarrow as pa +import pyarrow.compute as pc +import time + +path = os.path.dirname(os.path.abspath(__file__)) +filepath = os.path.join(path, "../tpch/data/lineitem.parquet") + +# This example serves to demonstrate alternate approaches to answering the +# question "return all of the rows that have a specific combination of these +# values". We have the combinations we care about provided as a python +# list of tuples. There is no built in function that supports this operation, +# but it can be explicitly specified via a single expression or we can +# use a user defined function. + +ctx = SessionContext() + +# These part keys and suppliers are chosen because there are +# cases where two suppliers each have two of the part keys +# but we are interested in these specific combinations. 
+ +values_of_interest = [ + (1530, 4031, "N"), + (6530, 1531, "N"), + (5618, 619, "N"), + (8118, 8119, "N"), +] + +partkeys = [lit(r[0]) for r in values_of_interest] +suppkeys = [lit(r[1]) for r in values_of_interest] +returnflags = [lit(r[2]) for r in values_of_interest] + +df_lineitem = ctx.read_parquet(filepath).select( + "l_partkey", "l_suppkey", "l_returnflag" +) + +start_time = time.time() + +df_simple_filter = df_lineitem.filter( + F.in_list(col("l_partkey"), partkeys), + F.in_list(col("l_suppkey"), suppkeys), + F.in_list(col("l_returnflag"), returnflags), +) + +num_rows = df_simple_filter.count() +print( + f"Simple filtering has number {num_rows} rows and took {time.time() - start_time} s" +) +print("This is the incorrect number of rows!") +start_time = time.time() + +# Explicitly check for the combinations of interest. +# This works but is not scalable. + +filter_expr = ( + ( + (col("l_partkey") == values_of_interest[0][0]) + & (col("l_suppkey") == values_of_interest[0][1]) + & (col("l_returnflag") == values_of_interest[0][2]) + ) + | ( + (col("l_partkey") == values_of_interest[1][0]) + & (col("l_suppkey") == values_of_interest[1][1]) + & (col("l_returnflag") == values_of_interest[1][2]) + ) + | ( + (col("l_partkey") == values_of_interest[2][0]) + & (col("l_suppkey") == values_of_interest[2][1]) + & (col("l_returnflag") == values_of_interest[2][2]) + ) + | ( + (col("l_partkey") == values_of_interest[3][0]) + & (col("l_suppkey") == values_of_interest[3][1]) + & (col("l_returnflag") == values_of_interest[3][2]) + ) +) + +df_explicit_filter = df_lineitem.filter(filter_expr) + +num_rows = df_explicit_filter.count() +print( + f"Explicit filtering has number {num_rows} rows and took {time.time() - start_time} s" +) +start_time = time.time() + +# Instead try a python UDF + + +def is_of_interest_impl( + partkey_arr: pa.Array, + suppkey_arr: pa.Array, + returnflag_arr: pa.Array, +) -> pa.Array: + result = [] + for idx, partkey in enumerate(partkey_arr): + partkey = 
partkey.as_py() + suppkey = suppkey_arr[idx].as_py() + returnflag = returnflag_arr[idx].as_py() + value = (partkey, suppkey, returnflag) + result.append(value in values_of_interest) + + return pa.array(result) + + +is_of_interest = udf( + is_of_interest_impl, + [pa.int32(), pa.int32(), pa.utf8()], + pa.bool_(), + "stable", +) + +df_udf_filter = df_lineitem.filter( + is_of_interest(col("l_partkey"), col("l_suppkey"), col("l_returnflag")) +) + +num_rows = df_udf_filter.count() +print(f"UDF filtering has number {num_rows} rows and took {time.time() - start_time} s") +start_time = time.time() + +# Now use a user defined function but lean on the built in pyarrow array +# functions so we never convert rows to python objects. + +# To see other pyarrow compute functions see +# https://arrow.apache.org/docs/python/api/compute.html +# +# It is important that the number of rows in the returned array +# matches the original array, so we cannot use functions like +# filtered_partkey_arr.filter(filtered_suppkey_arr). 
+ + +def udf_using_pyarrow_compute_impl( + partkey_arr: pa.Array, + suppkey_arr: pa.Array, + returnflag_arr: pa.Array, +) -> pa.Array: + results = None + for partkey, suppkey, returnflag in values_of_interest: + filtered_partkey_arr = pc.equal(partkey_arr, partkey) + filtered_suppkey_arr = pc.equal(suppkey_arr, suppkey) + filtered_returnflag_arr = pc.equal(returnflag_arr, returnflag) + + resultant_arr = pc.and_(filtered_partkey_arr, filtered_suppkey_arr) + resultant_arr = pc.and_(resultant_arr, filtered_returnflag_arr) + + if results is None: + results = resultant_arr + else: + results = pc.or_(results, resultant_arr) + + return results + + +udf_using_pyarrow_compute = udf( + udf_using_pyarrow_compute_impl, + [pa.int32(), pa.int32(), pa.utf8()], + pa.bool_(), + "stable", +) + +df_udf_pyarrow_compute = df_lineitem.filter( + udf_using_pyarrow_compute(col("l_partkey"), col("l_suppkey"), col("l_returnflag")) +) + +num_rows = df_udf_pyarrow_compute.count() +print( + f"UDF filtering using pyarrow compute has number {num_rows} rows and took {time.time() - start_time} s" +) +start_time = time.time() From 9a6805e99f5a4f9b872453ae640c2b4633c9dcbf Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 1 Aug 2024 10:53:38 -0400 Subject: [PATCH 007/248] Add missing exports for wrapper modules (#782) * Add imports in base file to match those in internal * Correct class capitalization and exports for substrait * Add exports for common to match internal * Add exports for Expr to match internal * Add __all__ to functions * Add exports for object store to match internal * Add pytest to ensure all pyo3 exposed objects are also exposed in our wrappers so we don't miss any functions or classes * Add license --- python/datafusion/__init__.py | 22 +- python/datafusion/common.py | 32 ++- python/datafusion/expr.py | 74 ++++++ python/datafusion/functions.py | 221 ++++++++++++++++++ python/datafusion/object_store.py | 12 + python/datafusion/substrait.py | 21 +- 
.../datafusion/tests/test_wrapper_coverage.py | 49 ++++ src/substrait.rs | 6 +- 8 files changed, 423 insertions(+), 14 deletions(-) create mode 100644 python/datafusion/tests/test_wrapper_coverage.py diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 59bc8e306..0569ac4b0 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -33,8 +33,12 @@ SQLOptions, ) +from .catalog import Catalog, Database, Table + # The following imports are okay to remain as opaque to the user. -from ._internal import Config +from ._internal import Config, LogicalPlan, ExecutionPlan, runtime + +from .record_batch import RecordBatchStream, RecordBatch from .udf import ScalarUDF, AggregateUDF, Accumulator @@ -49,6 +53,8 @@ WindowFrame, ) +from . import functions, object_store, substrait + __version__ = importlib_metadata.version(__name__) __all__ = [ @@ -65,6 +71,20 @@ "column", "literal", "DFSchema", + "runtime", + "Catalog", + "Database", + "Table", + "AggregateUDF", + "LogicalPlan", + "ExecutionPlan", + "RecordBatch", + "RecordBatchStream", + "common", + "expr", + "functions", + "object_store", + "substrait", ] diff --git a/python/datafusion/common.py b/python/datafusion/common.py index 2351845b8..225e33304 100644 --- a/python/datafusion/common.py +++ b/python/datafusion/common.py @@ -16,8 +16,34 @@ # under the License. 
"""Common data types used throughout the DataFusion project.""" -from ._internal import common +from ._internal import common as common_internal +# TODO these should all have proper wrapper classes -def __getattr__(name): - return getattr(common, name) +DFSchema = common_internal.DFSchema +DataType = common_internal.DataType +DataTypeMap = common_internal.DataTypeMap +NullTreatment = common_internal.NullTreatment +PythonType = common_internal.PythonType +RexType = common_internal.RexType +SqlFunction = common_internal.SqlFunction +SqlSchema = common_internal.SqlSchema +SqlStatistics = common_internal.SqlStatistics +SqlTable = common_internal.SqlTable +SqlType = common_internal.SqlType +SqlView = common_internal.SqlView + +__all__ = [ + "DFSchema", + "DataType", + "DataTypeMap", + "RexType", + "PythonType", + "SqlType", + "NullTreatment", + "SqlTable", + "SqlSchema", + "SqlView", + "SqlStatistics", + "SqlFunction", +] diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index c04a525a6..318b8b9ae 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -47,6 +47,7 @@ CrossJoin = expr_internal.CrossJoin Distinct = expr_internal.Distinct DropTable = expr_internal.DropTable +EmptyRelation = expr_internal.EmptyRelation Exists = expr_internal.Exists Explain = expr_internal.Explain Extension = expr_internal.Extension @@ -58,6 +59,7 @@ InSubquery = expr_internal.InSubquery IsFalse = expr_internal.IsFalse IsNotTrue = expr_internal.IsNotTrue +IsNull = expr_internal.IsNull IsTrue = expr_internal.IsTrue IsUnknown = expr_internal.IsUnknown IsNotFalse = expr_internal.IsNotFalse @@ -83,6 +85,70 @@ TableScan = expr_internal.TableScan TryCast = expr_internal.TryCast Union = expr_internal.Union +Unnest = expr_internal.Unnest +Window = expr_internal.Window + +__all__ = [ + "Expr", + "Column", + "Literal", + "BinaryExpr", + "Literal", + "AggregateFunction", + "Not", + "IsNotNull", + "IsNull", + "IsTrue", + "IsFalse", + "IsUnknown", + "IsNotTrue", + 
"IsNotFalse", + "IsNotUnknown", + "Negative", + "Like", + "ILike", + "SimilarTo", + "ScalarVariable", + "Alias", + "InList", + "Exists", + "Subquery", + "InSubquery", + "ScalarSubquery", + "Placeholder", + "GroupingSet", + "Case", + "CaseBuilder", + "Cast", + "TryCast", + "Between", + "Explain", + "Limit", + "Aggregate", + "Sort", + "Analyze", + "EmptyRelation", + "Join", + "JoinType", + "JoinConstraint", + "CrossJoin", + "Union", + "Unnest", + "Extension", + "Filter", + "Projection", + "TableScan", + "CreateMemoryTable", + "CreateView", + "Distinct", + "SubqueryAlias", + "DropTable", + "Partitioning", + "Repartition", + "Window", + "WindowFrame", + "WindowFrameBound", +] class Expr: @@ -246,6 +312,14 @@ def __lt__(self, rhs: Any) -> Expr: rhs = Expr.literal(rhs) return Expr(self.expr.__lt__(rhs.expr)) + __radd__ = __add__ + __rand__ = __and__ + __rmod__ = __mod__ + __rmul__ = __mul__ + __ror__ = __or__ + __rsub__ = __sub__ + __rtruediv__ = __truediv__ + @staticmethod def literal(value: Any) -> Expr: """Creates a new expression representing a scalar value. 
diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 0cb5b0443..be83d359f 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -27,6 +27,227 @@ from datafusion.expr import CaseBuilder, Expr, WindowFrame from datafusion.context import SessionContext +__all__ = [ + "abs", + "acos", + "acosh", + "alias", + "approx_distinct", + "approx_median", + "approx_percentile_cont", + "approx_percentile_cont_with_weight", + "array", + "array_agg", + "array_append", + "array_cat", + "array_concat", + "array_dims", + "array_distinct", + "array_element", + "array_except", + "array_extract", + "array_has", + "array_has_all", + "array_has_any", + "array_indexof", + "array_intersect", + "array_join", + "array_length", + "array_ndims", + "array_pop_back", + "array_pop_front", + "array_position", + "array_positions", + "array_prepend", + "array_push_back", + "array_push_front", + "array_remove", + "array_remove_all", + "array_remove_n", + "array_repeat", + "array_replace", + "array_replace_all", + "array_replace_n", + "array_resize", + "array_slice", + "array_sort", + "array_to_string", + "array_union", + "arrow_typeof", + "ascii", + "asin", + "asinh", + "atan", + "atan2", + "atanh", + "avg", + "bit_and", + "bit_length", + "bit_or", + "bit_xor", + "bool_and", + "bool_or", + "btrim", + "case", + "cbrt", + "ceil", + "char_length", + "character_length", + "chr", + "coalesce", + "col", + "concat", + "concat_ws", + "corr", + "cos", + "cosh", + "cot", + "count", + "count_star", + "covar", + "covar_pop", + "covar_samp", + "current_date", + "current_time", + "date_bin", + "date_part", + "date_trunc", + "datepart", + "datetrunc", + "decode", + "degrees", + "digest", + "encode", + "ends_with", + "exp", + "factorial", + "find_in_set", + "first_value", + "flatten", + "floor", + "from_unixtime", + "gcd", + "grouping", + "in_list", + "initcap", + "isnan", + "iszero", + "last_value", + "lcm", + "left", + "length", + "levenshtein", + 
"list_append", + "list_dims", + "list_distinct", + "list_element", + "list_except", + "list_extract", + "list_indexof", + "list_intersect", + "list_join", + "list_length", + "list_ndims", + "list_position", + "list_positions", + "list_prepend", + "list_push_back", + "list_push_front", + "list_remove", + "list_remove_all", + "list_remove_n", + "list_replace", + "list_replace_all", + "list_replace_n", + "list_resize", + "list_slice", + "list_sort", + "list_to_string", + "list_union", + "ln", + "log", + "log10", + "log2", + "lower", + "lpad", + "ltrim", + "make_array", + "make_date", + "max", + "md5", + "mean", + "median", + "min", + "named_struct", + "nanvl", + "now", + "nullif", + "octet_length", + "order_by", + "overlay", + "pi", + "pow", + "power", + "radians", + "random", + "range", + "regexp_like", + "regexp_match", + "regexp_replace", + "regr_avgx", + "regr_avgy", + "regr_count", + "regr_intercept", + "regr_r2", + "regr_slope", + "regr_sxx", + "regr_sxy", + "regr_syy", + "repeat", + "replace", + "reverse", + "right", + "round", + "rpad", + "rtrim", + "sha224", + "sha256", + "sha384", + "sha512", + "signum", + "sin", + "sinh", + "split_part", + "sqrt", + "starts_with", + "stddev", + "stddev_pop", + "stddev_samp", + "strpos", + "struct", + "substr", + "substr_index", + "substring", + "sum", + "tan", + "tanh", + "to_hex", + "to_timestamp", + "to_timestamp_micros", + "to_timestamp_millis", + "to_timestamp_seconds", + "to_unixtime", + "translate", + "trim", + "trunc", + "upper", + "uuid", + "var", + "var_pop", + "var_samp", + "window", +] + def isnan(expr: Expr) -> Expr: """Returns true if a given number is +NaN or -NaN otherwise returns false.""" diff --git a/python/datafusion/object_store.py b/python/datafusion/object_store.py index a9bb83d29..c927e7614 100644 --- a/python/datafusion/object_store.py +++ b/python/datafusion/object_store.py @@ -18,6 +18,18 @@ from ._internal import object_store +AmazonS3 = object_store.AmazonS3 +GoogleCloud = 
object_store.GoogleCloud +LocalFileSystem = object_store.LocalFileSystem +MicrosoftAzure = object_store.MicrosoftAzure + +__all__ = [ + "AmazonS3", + "GoogleCloud", + "LocalFileSystem", + "MicrosoftAzure", +] + def __getattr__(name): return getattr(object_store, name) diff --git a/python/datafusion/substrait.py b/python/datafusion/substrait.py index a199dd733..4b44ad19b 100644 --- a/python/datafusion/substrait.py +++ b/python/datafusion/substrait.py @@ -33,6 +33,13 @@ from datafusion.context import SessionContext from datafusion._internal import LogicalPlan +__all__ = [ + "Plan", + "Consumer", + "Producer", + "Serde", +] + class Plan: """A class representing an encodable substrait plan.""" @@ -73,7 +80,7 @@ def serialize(sql: str, ctx: SessionContext, path: str | pathlib.Path) -> None: ctx: SessionContext to use. path: Path to write the Substrait plan to. """ - return substrait_internal.serde.serialize(sql, ctx.ctx, str(path)) + return substrait_internal.Serde.serialize(sql, ctx.ctx, str(path)) @staticmethod def serialize_to_plan(sql: str, ctx: SessionContext) -> Plan: @@ -86,7 +93,7 @@ def serialize_to_plan(sql: str, ctx: SessionContext) -> Plan: Returns: Substrait plan. """ - return Plan(substrait_internal.serde.serialize_to_plan(sql, ctx.ctx)) + return Plan(substrait_internal.Serde.serialize_to_plan(sql, ctx.ctx)) @staticmethod def serialize_bytes(sql: str, ctx: SessionContext) -> bytes: @@ -99,7 +106,7 @@ def serialize_bytes(sql: str, ctx: SessionContext) -> bytes: Returns: Substrait plan as bytes. """ - return substrait_internal.serde.serialize_bytes(sql, ctx.ctx) + return substrait_internal.Serde.serialize_bytes(sql, ctx.ctx) @staticmethod def deserialize(path: str | pathlib.Path) -> Plan: @@ -111,7 +118,7 @@ def deserialize(path: str | pathlib.Path) -> Plan: Returns: Substrait plan. 
""" - return Plan(substrait_internal.serde.deserialize(str(path))) + return Plan(substrait_internal.Serde.deserialize(str(path))) @staticmethod def deserialize_bytes(proto_bytes: bytes) -> Plan: @@ -123,7 +130,7 @@ def deserialize_bytes(proto_bytes: bytes) -> Plan: Returns: Substrait plan. """ - return Plan(substrait_internal.serde.deserialize_bytes(proto_bytes)) + return Plan(substrait_internal.Serde.deserialize_bytes(proto_bytes)) @deprecated("Use `Serde` instead.") @@ -148,7 +155,7 @@ def to_substrait_plan(logical_plan: LogicalPlan, ctx: SessionContext) -> Plan: Substrait plan. """ return Plan( - substrait_internal.producer.to_substrait_plan(logical_plan, ctx.ctx) + substrait_internal.Producer.to_substrait_plan(logical_plan, ctx.ctx) ) @@ -173,7 +180,7 @@ def from_substrait_plan(ctx: SessionContext, plan: Plan) -> LogicalPlan: Returns: LogicalPlan. """ - return substrait_internal.consumer.from_substrait_plan( + return substrait_internal.Consumer.from_substrait_plan( ctx.ctx, plan.plan_internal ) diff --git a/python/datafusion/tests/test_wrapper_coverage.py b/python/datafusion/tests/test_wrapper_coverage.py new file mode 100644 index 000000000..44b9ca831 --- /dev/null +++ b/python/datafusion/tests/test_wrapper_coverage.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +import datafusion +import datafusion.functions +import datafusion.object_store +import datafusion.substrait + + +def missing_exports(internal_obj, wrapped_obj) -> None: + for attr in dir(internal_obj): + assert attr in dir(wrapped_obj) + + internal_attr = getattr(internal_obj, attr) + wrapped_attr = getattr(wrapped_obj, attr) + + assert wrapped_attr is not None if internal_attr is not None else True + + if attr in ["__self__", "__class__"]: + continue + if isinstance(internal_attr, list): + assert isinstance(wrapped_attr, list) + for val in internal_attr: + assert val in wrapped_attr + elif hasattr(internal_attr, "__dict__"): + missing_exports(internal_attr, wrapped_attr) + + +def test_datafusion_missing_exports() -> None: + """Check for any missing Python exports. + + This test verifies that every exposed class, attribute, and function in + the internal (pyo3) module is also exposed in our python wrappers. + """ + missing_exports(datafusion._internal, datafusion) diff --git a/src/substrait.rs b/src/substrait.rs index 60a523800..f89b6b093 100644 --- a/src/substrait.rs +++ b/src/substrait.rs @@ -59,7 +59,7 @@ impl From for PyPlan { /// A PySubstraitSerializer is a representation of a Serializer that is capable of both serializing /// a `LogicalPlan` instance to Substrait Protobuf bytes and also deserialize Substrait Protobuf bytes /// to a valid `LogicalPlan` instance. 
-#[pyclass(name = "serde", module = "datafusion.substrait", subclass)] +#[pyclass(name = "Serde", module = "datafusion.substrait", subclass)] #[derive(Debug, Clone)] pub struct PySubstraitSerializer; @@ -105,7 +105,7 @@ impl PySubstraitSerializer { } } -#[pyclass(name = "producer", module = "datafusion.substrait", subclass)] +#[pyclass(name = "Producer", module = "datafusion.substrait", subclass)] #[derive(Debug, Clone)] pub struct PySubstraitProducer; @@ -121,7 +121,7 @@ impl PySubstraitProducer { } } -#[pyclass(name = "consumer", module = "datafusion.substrait", subclass)] +#[pyclass(name = "Consumer", module = "datafusion.substrait", subclass)] #[derive(Debug, Clone)] pub struct PySubstraitConsumer; From 3eb198b5171ebdd41a0279d1da059544429945ca Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Sun, 4 Aug 2024 18:07:34 -0500 Subject: [PATCH 008/248] Add PyExpr to_variant conversions (#793) * make PyExpr::to_variant arms explicit * update PyInList to wrap expr::InList * update PyExists to wrap expr::Exists * update PyInSubquery to wrap expr::InSubquery * update Placeholder to wrap expr::Placeholder * make PyLogicalPlan::to_variant match arms explicit * add PySortExpr wrapper * add PyUnnestExpr wrapper * update PyAlias to wrap upstream Alias * return not implemented error for unimplemnted variants in PyExpr::to_variant * added to_variant python test from the GH issue * remove unused import * return unsupported_variants for unimplemented variants in PyLogicalPlan::to_variant --- python/datafusion/tests/test_expr.py | 28 +++++++++++ src/expr.rs | 51 +++++++++++++++++--- src/expr/alias.rs | 32 +++++++------ src/expr/exists.rs | 15 +++--- src/expr/in_list.rs | 22 ++++----- src/expr/in_subquery.rs | 22 ++++----- src/expr/placeholder.rs | 21 ++++---- src/expr/sort_expr.rs | 71 ++++++++++++++++++++++++++++ src/expr/unnest_expr.rs | 67 ++++++++++++++++++++++++++ src/sql/logical.rs | 15 ++++-- 10 files changed, 273 insertions(+), 71 deletions(-) create mode 100644 
src/expr/sort_expr.rs create mode 100644 src/expr/unnest_expr.rs diff --git a/python/datafusion/tests/test_expr.py b/python/datafusion/tests/test_expr.py index c9f0e98d5..1a41120a5 100644 --- a/python/datafusion/tests/test_expr.py +++ b/python/datafusion/tests/test_expr.py @@ -139,3 +139,31 @@ def test_relational_expr(test_ctx): assert df.filter(col("b") != "beta").count() == 2 assert df.filter(col("a") == "beta").count() == 0 + + +def test_expr_to_variant(): + # Taken from https://github.com/apache/datafusion-python/issues/781 + from datafusion import SessionContext + from datafusion.expr import Filter + + + def traverse_logical_plan(plan): + cur_node = plan.to_variant() + if isinstance(cur_node, Filter): + return cur_node.predicate().to_variant() + if hasattr(plan, 'inputs'): + for input_plan in plan.inputs(): + res = traverse_logical_plan(input_plan) + if res is not None: + return res + + ctx = SessionContext() + data = {'id': [1, 2, 3], 'name': ['Alice', 'Bob', 'Charlie']} + ctx.from_pydict(data, name='table1') + query = "SELECT * FROM table1 t1 WHERE t1.name IN ('dfa', 'ad', 'dfre', 'vsa')" + logical_plan = ctx.sql(query).optimized_logical_plan() + variant = traverse_logical_plan(logical_plan) + assert variant is not None + assert variant.expr().to_variant().qualified_name() == 'table1.name' + assert str(variant.list()) == '[Expr(Utf8("dfa")), Expr(Utf8("ad")), Expr(Utf8("dfre")), Expr(Utf8("vsa"))]' + assert not variant.negated() diff --git a/src/expr.rs b/src/expr.rs index aab0daa6f..04bfc85c2 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -33,7 +33,7 @@ use datafusion_expr::{ }; use crate::common::data_type::{DataTypeMap, RexType}; -use crate::errors::{py_runtime_err, py_type_err, DataFusionError}; +use crate::errors::{py_runtime_err, py_type_err, py_unsupported_variant_err, DataFusionError}; use crate::expr::aggregate_expr::PyAggregateFunction; use crate::expr::binary_expr::PyBinaryExpr; use crate::expr::column::PyColumn; @@ -84,11 +84,13 @@ pub mod 
scalar_subquery; pub mod scalar_variable; pub mod signature; pub mod sort; +pub mod sort_expr; pub mod subquery; pub mod subquery_alias; pub mod table_scan; pub mod union; pub mod unnest; +pub mod unnest_expr; pub mod window; /// A PyExpr that can be used on a DataFrame @@ -119,8 +121,9 @@ pub fn py_expr_list(expr: &[Expr]) -> PyResult> { impl PyExpr { /// Return the specific expression fn to_variant(&self, py: Python) -> PyResult { - Python::with_gil(|_| match &self.expr { - Expr::Alias(alias) => Ok(PyAlias::new(&alias.expr, &alias.name).into_py(py)), + Python::with_gil(|_| { + match &self.expr { + Expr::Alias(alias) => Ok(PyAlias::from(alias.clone()).into_py(py)), Expr::Column(col) => Ok(PyColumn::from(col.clone()).into_py(py)), Expr::ScalarVariable(data_type, variables) => { Ok(PyScalarVariable::new(data_type, variables).into_py(py)) @@ -141,10 +144,44 @@ impl PyExpr { Expr::AggregateFunction(expr) => { Ok(PyAggregateFunction::from(expr.clone()).into_py(py)) } - other => Err(py_runtime_err(format!( - "Cannot convert this Expr to a Python object: {:?}", - other + Expr::SimilarTo(value) => Ok(PySimilarTo::from(value.clone()).into_py(py)), + Expr::Between(value) => Ok(between::PyBetween::from(value.clone()).into_py(py)), + Expr::Case(value) => Ok(case::PyCase::from(value.clone()).into_py(py)), + Expr::Cast(value) => Ok(cast::PyCast::from(value.clone()).into_py(py)), + Expr::TryCast(value) => Ok(cast::PyTryCast::from(value.clone()).into_py(py)), + Expr::Sort(value) => Ok(sort_expr::PySortExpr::from(value.clone()).into_py(py)), + Expr::ScalarFunction(value) => Err(py_unsupported_variant_err(format!( + "Converting Expr::ScalarFunction to a Python object is not implemented: {:?}", + value ))), + Expr::WindowFunction(value) => Err(py_unsupported_variant_err(format!( + "Converting Expr::WindowFunction to a Python object is not implemented: {:?}", + value + ))), + Expr::InList(value) => Ok(in_list::PyInList::from(value.clone()).into_py(py)), + Expr::Exists(value) => 
Ok(exists::PyExists::from(value.clone()).into_py(py)), + Expr::InSubquery(value) => { + Ok(in_subquery::PyInSubquery::from(value.clone()).into_py(py)) + } + Expr::ScalarSubquery(value) => { + Ok(scalar_subquery::PyScalarSubquery::from(value.clone()).into_py(py)) + } + Expr::Wildcard { qualifier } => Err(py_unsupported_variant_err(format!( + "Converting Expr::Wildcard to a Python object is not implemented : {:?}", + qualifier + ))), + Expr::GroupingSet(value) => { + Ok(grouping_set::PyGroupingSet::from(value.clone()).into_py(py)) + } + Expr::Placeholder(value) => { + Ok(placeholder::PyPlaceholder::from(value.clone()).into_py(py)) + } + Expr::OuterReferenceColumn(data_type, column) => Err(py_unsupported_variant_err(format!( + "Converting Expr::OuterReferenceColumn to a Python object is not implemented: {:?} - {:?}", + data_type, column + ))), + Expr::Unnest(value) => Ok(unnest_expr::PyUnnestExpr::from(value.clone()).into_py(py)), + } }) } @@ -599,6 +636,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -606,6 +644,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/src/expr/alias.rs b/src/expr/alias.rs index 2ce656342..3208800ad 100644 --- a/src/expr/alias.rs +++ b/src/expr/alias.rs @@ -19,13 +19,24 @@ use crate::expr::PyExpr; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; -use datafusion_expr::Expr; +use datafusion_expr::expr::Alias; #[pyclass(name = "Alias", module = "datafusion.expr", subclass)] #[derive(Clone)] pub struct PyAlias { - expr: PyExpr, - alias_name: String, + alias: Alias, +} + +impl From for PyAlias { + fn from(alias: Alias) -> Self { + Self { alias } + } +} + +impl From for Alias { + fn from(py_alias: PyAlias) -> Self { 
+ py_alias.alias + } } impl Display for PyAlias { @@ -35,29 +46,20 @@ impl Display for PyAlias { "Alias \nExpr: `{:?}` \nAlias Name: `{}`", - &self.expr, &self.alias_name + &self.alias.expr, &self.alias.name ) } } -impl PyAlias { - pub fn new(expr: &Expr, alias_name: &String) -> Self { - Self { - expr: expr.clone().into(), - alias_name: alias_name.to_owned(), - } - } -} - #[pymethods] impl PyAlias { /// Retrieve the "name" of the alias fn alias(&self) -> PyResult { - Ok(self.alias_name.clone()) + Ok(self.alias.name.clone()) } fn expr(&self) -> PyResult { - Ok(self.expr.clone()) + Ok((*self.alias.expr.clone()).into()) } /// Get a String representation of this column diff --git a/src/expr/exists.rs b/src/expr/exists.rs index 7df9a6e81..fd2aa8c2f 100644 --- a/src/expr/exists.rs +++ b/src/expr/exists.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_expr::Subquery; +use datafusion_expr::expr::Exists; use pyo3::prelude::*; use super::subquery::PySubquery; @@ -23,23 +23,22 @@ use super::subquery::PySubquery; #[pyclass(name = "Exists", module = "datafusion.expr", subclass)] #[derive(Clone)] pub struct PyExists { - subquery: Subquery, - negated: bool, + exists: Exists, } -impl PyExists { - pub fn new(subquery: Subquery, negated: bool) -> Self { - Self { subquery, negated } +impl From for PyExists { + fn from(exists: Exists) -> Self { + PyExists { exists } } } #[pymethods] impl PyExists { fn subquery(&self) -> PySubquery { - self.subquery.clone().into() + self.exists.subquery.clone().into() } fn negated(&self) -> bool { - self.negated + self.exists.negated } } diff --git a/src/expr/in_list.rs b/src/expr/in_list.rs index 840eee2ce..c1a99a3c8 100644 --- a/src/expr/in_list.rs +++ b/src/expr/in_list.rs @@ -16,38 +16,32 @@ // under the License. 
use crate::expr::PyExpr; -use datafusion_expr::Expr; +use datafusion_expr::expr::InList; use pyo3::prelude::*; #[pyclass(name = "InList", module = "datafusion.expr", subclass)] #[derive(Clone)] pub struct PyInList { - expr: Box, - list: Vec, - negated: bool, + in_list: InList, } -impl PyInList { - pub fn new(expr: Box, list: Vec, negated: bool) -> Self { - Self { - expr, - list, - negated, - } +impl From for PyInList { + fn from(in_list: InList) -> Self { + PyInList { in_list } } } #[pymethods] impl PyInList { fn expr(&self) -> PyExpr { - (*self.expr).clone().into() + (*self.in_list.expr).clone().into() } fn list(&self) -> Vec { - self.list.iter().map(|e| e.clone().into()).collect() + self.in_list.list.iter().map(|e| e.clone().into()).collect() } fn negated(&self) -> bool { - self.negated + self.in_list.negated } } diff --git a/src/expr/in_subquery.rs b/src/expr/in_subquery.rs index 6cee4a1f0..7dfafdbf0 100644 --- a/src/expr/in_subquery.rs +++ b/src/expr/in_subquery.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion_expr::{Expr, Subquery}; +use datafusion_expr::expr::InSubquery; use pyo3::prelude::*; use super::{subquery::PySubquery, PyExpr}; @@ -23,32 +23,26 @@ use super::{subquery::PySubquery, PyExpr}; #[pyclass(name = "InSubquery", module = "datafusion.expr", subclass)] #[derive(Clone)] pub struct PyInSubquery { - expr: Box, - subquery: Subquery, - negated: bool, + in_subquery: InSubquery, } -impl PyInSubquery { - pub fn new(expr: Box, subquery: Subquery, negated: bool) -> Self { - Self { - expr, - subquery, - negated, - } +impl From for PyInSubquery { + fn from(in_subquery: InSubquery) -> Self { + PyInSubquery { in_subquery } } } #[pymethods] impl PyInSubquery { fn expr(&self) -> PyExpr { - (*self.expr).clone().into() + (*self.in_subquery.expr).clone().into() } fn subquery(&self) -> PySubquery { - self.subquery.clone().into() + self.in_subquery.subquery.clone().into() } fn negated(&self) -> bool { - self.negated + self.in_subquery.negated } } diff --git a/src/expr/placeholder.rs b/src/expr/placeholder.rs index e37c8b561..ca75ce37e 100644 --- a/src/expr/placeholder.rs +++ b/src/expr/placeholder.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion::arrow::datatypes::DataType; +use datafusion_expr::expr::Placeholder; use pyo3::prelude::*; use crate::common::data_type::PyDataType; @@ -23,26 +23,25 @@ use crate::common::data_type::PyDataType; #[pyclass(name = "Placeholder", module = "datafusion.expr", subclass)] #[derive(Clone)] pub struct PyPlaceholder { - id: String, - data_type: Option, + placeholder: Placeholder, } -impl PyPlaceholder { - pub fn new(id: String, data_type: DataType) -> Self { - Self { - id, - data_type: Some(data_type), - } +impl From for PyPlaceholder { + fn from(placeholder: Placeholder) -> Self { + PyPlaceholder { placeholder } } } #[pymethods] impl PyPlaceholder { fn id(&self) -> String { - self.id.clone() + self.placeholder.id.clone() } fn data_type(&self) -> Option { - self.data_type.as_ref().map(|e| e.clone().into()) + self.placeholder + .data_type + .as_ref() + .map(|e| e.clone().into()) } } diff --git a/src/expr/sort_expr.rs b/src/expr/sort_expr.rs new file mode 100644 index 000000000..6a8a0cf0c --- /dev/null +++ b/src/expr/sort_expr.rs @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::expr::PyExpr; +use datafusion_expr::SortExpr; +use pyo3::prelude::*; +use std::fmt::{self, Display, Formatter}; + +#[pyclass(name = "SortExpr", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PySortExpr { + sort: SortExpr, +} + +impl From for SortExpr { + fn from(sort: PySortExpr) -> Self { + sort.sort + } +} + +impl From for PySortExpr { + fn from(sort: SortExpr) -> PySortExpr { + PySortExpr { sort } + } +} + +impl Display for PySortExpr { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!( + f, + "Sort + Expr: {:?} + Asc: {:?} + NullsFirst: {:?}", + &self.sort.expr, &self.sort.asc, &self.sort.nulls_first + ) + } +} + +#[pymethods] +impl PySortExpr { + fn expr(&self) -> PyResult { + Ok((*self.sort.expr).clone().into()) + } + + fn ascending(&self) -> PyResult { + Ok(self.sort.asc) + } + + fn nulls_first(&self) -> PyResult { + Ok(self.sort.nulls_first) + } + + fn __repr__(&self) -> String { + format!("{}", self) + } +} diff --git a/src/expr/unnest_expr.rs b/src/expr/unnest_expr.rs new file mode 100644 index 000000000..a2f8230cc --- /dev/null +++ b/src/expr/unnest_expr.rs @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use datafusion_expr::expr::Unnest; +use pyo3::prelude::*; +use std::fmt::{self, Display, Formatter}; + +use super::PyExpr; + +#[pyclass(name = "UnnestExpr", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyUnnestExpr { + unnest: Unnest, +} + +impl From for PyUnnestExpr { + fn from(unnest: Unnest) -> PyUnnestExpr { + PyUnnestExpr { unnest } + } +} + +impl From for Unnest { + fn from(unnest: PyUnnestExpr) -> Self { + unnest.unnest + } +} + +impl Display for PyUnnestExpr { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!( + f, + "Unnest + Expr: {:?}", + &self.unnest.expr, + ) + } +} + +#[pymethods] +impl PyUnnestExpr { + /// Retrieves the expression that is being unnested + fn expr(&self) -> PyResult { + Ok((*self.unnest.expr).clone().into()) + } + + fn __repr__(&self) -> PyResult { + Ok(format!("UnnestExpr({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("UnnestExpr".to_string()) + } +} diff --git a/src/sql/logical.rs b/src/sql/logical.rs index b1446b92a..c4471f503 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -81,9 +81,18 @@ impl PyLogicalPlan { LogicalPlan::SubqueryAlias(plan) => PySubqueryAlias::from(plan.clone()).to_variant(py), LogicalPlan::Unnest(plan) => PyUnnest::from(plan.clone()).to_variant(py), LogicalPlan::Window(plan) => PyWindow::from(plan.clone()).to_variant(py), - other => Err(py_unsupported_variant_err(format!( - "Cannot convert this plan to a LogicalNode: {:?}", - other + LogicalPlan::Repartition(_) + | LogicalPlan::Union(_) + | LogicalPlan::Statement(_) + | LogicalPlan::Values(_) + | LogicalPlan::Prepare(_) + | LogicalPlan::Dml(_) + | LogicalPlan::Ddl(_) + | LogicalPlan::Copy(_) + | LogicalPlan::DescribeTable(_) + | LogicalPlan::RecursiveQuery(_) => Err(py_unsupported_variant_err(format!( + "Conversion of variant not implemented: {:?}", + self.plan ))), } } From 1d615482ff42960dfa6987de7d3572e393aa235c Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 5 Aug 2024 15:26:58 -0400 Subject: 
[PATCH 009/248] Add missing expressions to wrapper export (#795) --- python/datafusion/expr.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 318b8b9ae..955c4e736 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -80,12 +80,14 @@ ScalarVariable = expr_internal.ScalarVariable SimilarTo = expr_internal.SimilarTo Sort = expr_internal.Sort +SortExpr = expr_internal.SortExpr Subquery = expr_internal.Subquery SubqueryAlias = expr_internal.SubqueryAlias TableScan = expr_internal.TableScan TryCast = expr_internal.TryCast Union = expr_internal.Union Unnest = expr_internal.Unnest +UnnestExpr = expr_internal.UnnestExpr Window = expr_internal.Window __all__ = [ @@ -126,6 +128,7 @@ "Limit", "Aggregate", "Sort", + "SortExpr", "Analyze", "EmptyRelation", "Join", @@ -134,6 +137,7 @@ "CrossJoin", "Union", "Unnest", + "UnnestExpr", "Extension", "Filter", "Projection", From bd0e82088b8504b4f3683e085601e3bdd6f3aaa0 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 6 Aug 2024 10:30:32 -0400 Subject: [PATCH 010/248] Doc/cross reference (#791) * Update docstrings so that cross references work in online docs. 
Also switch from autosummary to autoapi in sphinx for building API reference documents * Update documentation to cross reference * Correct class names and internal attr * Revert changes that will end up coming in via PR #782 * Add autoapi to requirements file * Add git ignore for files retrieved during local site building * Remove unused portions of doc config * Reset substrait capitalization that was reverted during rebase * Small example changes --- docs/.gitignore | 2 + docs/requirements.txt | 3 +- docs/source/api.rst | 31 --- docs/source/api/dataframe.rst | 27 --- docs/source/api/execution_context.rst | 29 --- docs/source/api/expression.rst | 27 --- docs/source/api/functions.rst | 27 --- docs/source/api/object_store.rst | 27 --- docs/source/conf.py | 56 +++-- docs/source/index.rst | 2 - docs/source/user-guide/basics.rst | 14 +- .../common-operations/aggregations.rst | 2 +- .../common-operations/basic-info.rst | 8 +- .../common-operations/expressions.rst | 12 +- .../common-operations/functions.rst | 14 +- .../user-guide/common-operations/joins.rst | 4 +- .../common-operations/select-and-filter.rst | 10 +- .../common-operations/udf-and-udfa.rst | 4 +- .../user-guide/common-operations/windows.rst | 6 +- docs/source/user-guide/configuration.rst | 8 +- docs/source/user-guide/io/avro.rst | 2 +- docs/source/user-guide/io/csv.rst | 6 +- docs/source/user-guide/io/json.rst | 2 +- docs/source/user-guide/io/parquet.rst | 6 +- examples/python-udf-comparisons.py | 6 +- python/datafusion/__init__.py | 11 +- python/datafusion/catalog.py | 4 +- python/datafusion/context.py | 134 ++++++----- python/datafusion/dataframe.py | 109 +++++---- python/datafusion/expr.py | 23 +- python/datafusion/functions.py | 218 +++++++++--------- python/datafusion/record_batch.py | 14 +- python/datafusion/substrait.py | 3 +- python/datafusion/udf.py | 36 +-- 34 files changed, 370 insertions(+), 517 deletions(-) create mode 100644 docs/.gitignore delete mode 100644 docs/source/api.rst delete mode 
100644 docs/source/api/dataframe.rst delete mode 100644 docs/source/api/execution_context.rst delete mode 100644 docs/source/api/expression.rst delete mode 100644 docs/source/api/functions.rst delete mode 100644 docs/source/api/object_store.rst diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 000000000..41e135341 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,2 @@ +pokemon.csv +yellow_trip_data.parquet diff --git a/docs/requirements.txt b/docs/requirements.txt index 42bc4e517..f5cece78e 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -22,4 +22,5 @@ maturin jinja2 ipython pandas -pickleshare \ No newline at end of file +pickleshare +sphinx-autoapi diff --git a/docs/source/api.rst b/docs/source/api.rst deleted file mode 100644 index d9f4a09dd..000000000 --- a/docs/source/api.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _api: - -************* -API Reference -************* - -.. 
toctree:: - :maxdepth: 2 - - api/dataframe - api/execution_context - api/expression - api/functions - api/object_store diff --git a/docs/source/api/dataframe.rst b/docs/source/api/dataframe.rst deleted file mode 100644 index 0a3c4c8b1..000000000 --- a/docs/source/api/dataframe.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _api.dataframe: -.. currentmodule:: datafusion - -DataFrame -========= - -.. autosummary:: - :toctree: ../generated/ - - DataFrame diff --git a/docs/source/api/execution_context.rst b/docs/source/api/execution_context.rst deleted file mode 100644 index a3bda76d7..000000000 --- a/docs/source/api/execution_context.rst +++ /dev/null @@ -1,29 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. 
Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _api.execution_context: -.. currentmodule:: datafusion - -SessionContext -============== - -.. autosummary:: - :toctree: ../generated/ - - SessionConfig - RuntimeConfig - SessionContext diff --git a/docs/source/api/expression.rst b/docs/source/api/expression.rst deleted file mode 100644 index 30137d135..000000000 --- a/docs/source/api/expression.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _api.expression: -.. currentmodule:: datafusion - -Expr -========== - -.. autosummary:: - :toctree: ../generated/ - - Expr diff --git a/docs/source/api/functions.rst b/docs/source/api/functions.rst deleted file mode 100644 index 6f10d826e..000000000 --- a/docs/source/api/functions.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. 
distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _api.functions: -.. currentmodule:: datafusion - -Functions -========= - -.. autosummary:: - :toctree: ../generated/ - - functions diff --git a/docs/source/api/object_store.rst b/docs/source/api/object_store.rst deleted file mode 100644 index 8d78f0724..000000000 --- a/docs/source/api/object_store.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _api.object_store: -.. currentmodule:: datafusion.object_store - -ObjectStore -=========== - -.. 
autosummary:: - :toctree: ../generated/ - - object_store \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 308069b6c..d5084551e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -46,15 +46,11 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.autosummary", - "sphinx.ext.doctest", - "sphinx.ext.ifconfig", "sphinx.ext.mathjax", - "sphinx.ext.viewcode", "sphinx.ext.napoleon", "myst_parser", "IPython.sphinxext.ipython_directive", + "autoapi.extension", ] source_suffix = { @@ -70,33 +66,35 @@ # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [] -# Show members for classes in .. autosummary -autodoc_default_options = { - "members": None, - "undoc-members": None, - "show-inheritance": None, - "inherited-members": None, -} - -autosummary_generate = True - +autoapi_dirs = ["../../python"] +autoapi_ignore = ["*tests*"] +autoapi_member_order = "groupwise" +suppress_warnings = ["autoapi.python_import_resolution"] +autoapi_python_class_content = "both" -def autodoc_skip_member(app, what, name, obj, skip, options): - exclude_functions = "__init__" - exclude_classes = ("Expr", "DataFrame") - class_name = "" - if hasattr(obj, "__qualname__"): - if obj.__qualname__ is not None: - class_name = obj.__qualname__.split(".")[0] +def autoapi_skip_member_fn(app, what, name, obj, skip, options): + skip_contents = [ + # Re-exports + ("class", "datafusion.DataFrame"), + ("class", "datafusion.SessionContext"), + ("module", "datafusion.common"), + # Deprecated + ("class", "datafusion.substrait.serde"), + ("class", "datafusion.substrait.plan"), + ("class", "datafusion.substrait.producer"), + ("class", "datafusion.substrait.consumer"), + ("method", "datafusion.context.SessionContext.tables"), + ("method", "datafusion.dataframe.DataFrame.unnest_column"), + ] + if (what, name) in skip_contents: + skip = True - should_exclude = name 
in exclude_functions and class_name in exclude_classes + return skip - return True if should_exclude else None - -def setup(app): - app.connect("autodoc-skip-member", autodoc_skip_member) +def setup(sphinx): + sphinx.connect("autoapi-skip-member", autoapi_skip_member_fn) # -- Options for HTML output ------------------------------------------------- @@ -106,9 +104,7 @@ def setup(app): # html_theme = "pydata_sphinx_theme" -html_theme_options = { - "use_edit_page_button": True, -} +html_theme_options = {"use_edit_page_button": False, "show_toc_level": 2} html_context = { "github_user": "apache", diff --git a/docs/source/index.rst b/docs/source/index.rst index 16c88e033..b0103a336 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -104,5 +104,3 @@ Example :hidden: :maxdepth: 1 :caption: API - - api diff --git a/docs/source/user-guide/basics.rst b/docs/source/user-guide/basics.rst index 438b23199..3c97d1ef9 100644 --- a/docs/source/user-guide/basics.rst +++ b/docs/source/user-guide/basics.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. _user_guide_concepts: + Concepts ======== @@ -52,7 +54,7 @@ The first statement group: # create a context ctx = datafusion.SessionContext() -creates a :code:`SessionContext`, that is, the main interface for executing queries with DataFusion. It maintains the state +creates a :py:class:`~datafusion.context.SessionContext`, that is, the main interface for executing queries with DataFusion. It maintains the state of the connection between a user and an instance of the DataFusion engine. Additionally it provides the following functionality: - Create a DataFrame from a CSV or Parquet data source. @@ -72,9 +74,9 @@ The second statement group creates a :code:`DataFrame`, df = ctx.create_dataframe([[batch]]) A DataFrame refers to a (logical) set of rows that share the same column names, similar to a `Pandas DataFrame `_. 
-DataFrames are typically created by calling a method on :code:`SessionContext`, such as :code:`read_csv`, and can then be modified by -calling the transformation methods, such as :meth:`.DataFrame.filter`, :meth:`.DataFrame.select`, :meth:`.DataFrame.aggregate`, -and :meth:`.DataFrame.limit` to build up a query definition. +DataFrames are typically created by calling a method on :py:class:`~datafusion.context.SessionContext`, such as :code:`read_csv`, and can then be modified by +calling the transformation methods, such as :py:func:`~datafusion.dataframe.DataFrame.filter`, :py:func:`~datafusion.dataframe.DataFrame.select`, :py:func:`~datafusion.dataframe.DataFrame.aggregate`, +and :py:func:`~datafusion.dataframe.DataFrame.limit` to build up a query definition. The third statement uses :code:`Expressions` to build up a query definition. @@ -85,5 +87,5 @@ The third statement uses :code:`Expressions` to build up a query definition. col("a") - col("b"), ) -Finally the :code:`collect` method converts the logical plan represented by the DataFrame into a physical plan and execute it, -collecting all results into a list of `RecordBatch `_. \ No newline at end of file +Finally the :py:func:`~datafusion.dataframe.DataFrame.collect` method converts the logical plan represented by the DataFrame into a physical plan and execute it, +collecting all results into a list of `RecordBatch `_. diff --git a/docs/source/user-guide/common-operations/aggregations.rst b/docs/source/user-guide/common-operations/aggregations.rst index 235d644e6..b9202129e 100644 --- a/docs/source/user-guide/common-operations/aggregations.rst +++ b/docs/source/user-guide/common-operations/aggregations.rst @@ -19,7 +19,7 @@ Aggregation ============ An aggregate or aggregation is a function where the values of multiple rows are processed together to form a single summary value. 
-For performing an aggregation, DataFusion provides the :meth:`.DataFrame.aggregate` +For performing an aggregation, DataFusion provides the :py:func:`~datafusion.dataframe.DataFrame.aggregate` .. ipython:: python diff --git a/docs/source/user-guide/common-operations/basic-info.rst b/docs/source/user-guide/common-operations/basic-info.rst index 424e1cc92..d48b49d5c 100644 --- a/docs/source/user-guide/common-operations/basic-info.rst +++ b/docs/source/user-guide/common-operations/basic-info.rst @@ -34,26 +34,26 @@ In this section, you will learn how to display essential details of DataFrames u }) df -Use :meth:`.DataFrame.limit` to view the top rows of the frame: +Use :py:func:`~datafusion.dataframe.DataFrame.limit` to view the top rows of the frame: .. ipython:: python df.limit(2) -Display the columns of the DataFrame using :meth:`.DataFrame.schema`: +Display the columns of the DataFrame using :py:func:`~datafusion.dataframe.DataFrame.schema`: .. ipython:: python df.schema() -The method :meth:`.DataFrame.to_pandas` uses pyarrow to convert to pandas DataFrame, by collecting the batches, +The method :py:func:`~datafusion.dataframe.DataFrame.to_pandas` uses pyarrow to convert to pandas DataFrame, by collecting the batches, passing them to an Arrow table, and then converting them to a pandas DataFrame. .. ipython:: python df.to_pandas() -:meth:`.DataFrame.describe` shows a quick statistic summary of your data: +:py:func:`~datafusion.dataframe.DataFrame.describe` shows a quick statistic summary of your data: .. ipython:: python diff --git a/docs/source/user-guide/common-operations/expressions.rst b/docs/source/user-guide/common-operations/expressions.rst index ebb514f14..c8f8b8f29 100644 --- a/docs/source/user-guide/common-operations/expressions.rst +++ b/docs/source/user-guide/common-operations/expressions.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. 
_expressions: + Expressions =========== @@ -26,16 +28,16 @@ concept shared across most compilers and databases. Column ------ -The first expression most new users will interact with is the Column, which is created by calling :func:`col`. -This expression represents a column within a DataFrame. The function :func:`col` takes as in input a string +The first expression most new users will interact with is the Column, which is created by calling :py:func:`~datafusion.col`. +This expression represents a column within a DataFrame. The function :py:func:`~datafusion.col` takes as in input a string and returns an expression as it's output. Literal ------- Literal expressions represent a single value. These are helpful in a wide range of operations where -a specific, known value is of interest. You can create a literal expression using the function :func:`lit`. -The type of the object passed to the :func:`lit` function will be used to convert it to a known data type. +a specific, known value is of interest. You can create a literal expression using the function :py:func:`~datafusion.lit`. +The type of the object passed to the :py:func:`~datafusion.lit` function will be used to convert it to a known data type. In the following example we create expressions for the column named `color` and the literal scalar string `red`. The resultant variable `red_units` is itself also an expression. @@ -62,7 +64,7 @@ Functions --------- As mentioned before, most functions in DataFusion return an expression at their output. This allows us to create -a wide variety of expressions built up from other expressions. For example, :func:`.alias` is a function that takes +a wide variety of expressions built up from other expressions. For example, :py:func:`~datafusion.expr.Expr.alias` is a function that takes as it input a single expression and returns an expression in which the name of the expression has changed. 
The following example shows a series of expressions that are built up from functions operating on expressions. diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst index d793314f7..a0b95c908 100644 --- a/docs/source/user-guide/common-operations/functions.rst +++ b/docs/source/user-guide/common-operations/functions.rst @@ -19,7 +19,7 @@ Functions ========= DataFusion provides a large number of built-in functions for performing complex queries without requiring user-defined functions. -In here we will cover some of the more popular use cases. If you want to view all the functions go to the :ref:`Functions` API Reference. +In here we will cover some of the more popular use cases. If you want to view all the functions go to the :py:mod:`Functions ` API Reference. We'll use the pokemon dataset in the following examples. @@ -40,7 +40,7 @@ We'll use the pokemon dataset in the following examples. Mathematical ------------ -DataFusion offers mathematical functions such as :func:`.pow` or :func:`.log` +DataFusion offers mathematical functions such as :py:func:`~datafusion.functions.pow` or :py:func:`~datafusion.functions.log` .. ipython:: python @@ -55,7 +55,7 @@ DataFusion offers mathematical functions such as :func:`.pow` or :func:`.log` Conditional ----------- -There 3 conditional functions in DataFusion :func:`.coalesce`, :func:`.nullif` and :func:`.case` (not available in Python) +There 3 conditional functions in DataFusion :py:func:`~datafusion.functions.coalesce`, :py:func:`~datafusion.functions.nullif` and :py:func:`~datafusion.functions.case`. .. ipython:: python @@ -66,13 +66,13 @@ There 3 conditional functions in DataFusion :func:`.coalesce`, :func:`.nullif` a Temporal -------- -For selecting the current time use :func:`.now` +For selecting the current time use :py:func:`~datafusion.functions.now` .. 
ipython:: python df.select(f.now()) -Convert to timestamps using :func:`.to_timestamp` +Convert to timestamps using :py:func:`~datafusion.functions.to_timestamp` .. ipython:: python @@ -92,7 +92,7 @@ DataFusion offers a range of helpful options. f.left(col('"Name"'), literal(4)).alias("code") ) -This also includes the functions for regular expressions like :func:`.regexp_replace` and :func:`.regexp_match` +This also includes the functions for regular expressions like :py:func:`~datafusion.functions.regexp_replace` and :py:func:`~datafusion.functions.regexp_match` .. ipython:: python @@ -105,7 +105,7 @@ This also includes the functions for regular expressions like :func:`.regexp_rep Other ----- -The function :func:`.in_list` allows to check a column for the presence of multiple values: +The function :py:func:`~datafusion.functions.in_list` allows to check a column for the presence of multiple values: .. ipython:: python diff --git a/docs/source/user-guide/common-operations/joins.rst b/docs/source/user-guide/common-operations/joins.rst index 128203116..09fa145a7 100644 --- a/docs/source/user-guide/common-operations/joins.rst +++ b/docs/source/user-guide/common-operations/joins.rst @@ -18,7 +18,7 @@ Joins ===== -DataFusion supports the following join variants via the method :meth:`.DataFrame.join` +DataFusion supports the following join variants via the method :py:func:`~datafusion.dataframe.DataFrame.join` - Inner Join - Left Join @@ -58,7 +58,7 @@ will be included in the resulting DataFrame. left.join(right, join_keys=(["customer_id"], ["id"]), how="inner") -The parameter :code:`join_keys` specifies the columns from the left DataFrame and right DataFrame that contains the values +The parameter ``join_keys`` specifies the columns from the left DataFrame and right DataFrame that contains the values that should match. 
Left Join diff --git a/docs/source/user-guide/common-operations/select-and-filter.rst b/docs/source/user-guide/common-operations/select-and-filter.rst index 8ede230e6..92b4841b2 100644 --- a/docs/source/user-guide/common-operations/select-and-filter.rst +++ b/docs/source/user-guide/common-operations/select-and-filter.rst @@ -18,7 +18,7 @@ Column Selections ================= -Use :meth:`.DataFrame.select_columns` for basic column selection. +Use :py:func:`~datafusion.dataframe.DataFrame.select` for basic column selection. DataFusion can work with several file types, to start simple we can use a subset of the `TLC Trip Record Data `_ @@ -35,8 +35,8 @@ DataFusion can work with several file types, to start simple we can use a subset df = ctx.read_parquet("yellow_trip_data.parquet") df.select_columns("trip_distance", "passenger_count") -For mathematical or logical operations use :func:`.col` to select columns, and give meaningful names to the resulting -operations using :func:`.alias` +For mathematical or logical operations use :py:func:`~datafusion.col` to select columns, and give meaningful names to the resulting +operations using :py:func:`~datafusion.expr.Expr.alias` .. ipython:: python @@ -48,7 +48,7 @@ operations using :func:`.alias` Please be aware that all identifiers are effectively made lower-case in SQL, so if your file has capital letters (ex: Name) you must put your column name in double quotes or the selection won’t work. As an alternative for simple - column selection use :meth:`.DataFrame.select_columns` without double quotes + column selection use :py:func:`~datafusion.dataframe.DataFrame.select_columns` without double quotes For selecting columns with capital letters use ``'"VendorID"'`` @@ -57,7 +57,7 @@ For selecting columns with capital letters use ``'"VendorID"'`` df.select(col('"VendorID"')) -To combine it with literal values use the :func:`.lit` +To combine it with literal values use the :py:func:`~datafusion.lit` .. 
ipython:: python diff --git a/docs/source/user-guide/common-operations/udf-and-udfa.rst b/docs/source/user-guide/common-operations/udf-and-udfa.rst index 62d249c7e..54c685794 100644 --- a/docs/source/user-guide/common-operations/udf-and-udfa.rst +++ b/docs/source/user-guide/common-operations/udf-and-udfa.rst @@ -19,7 +19,7 @@ User Defined Functions ====================== DataFusion provides powerful expressions and functions, reducing the need for custom Python functions. -However you can still incorporate your own functions, i.e. User-Defined Functions (UDFs), with the :func:`.udf` function. +However you can still incorporate your own functions, i.e. User-Defined Functions (UDFs), with the :py:func:`~datafusion.udf.ScalarUDF.udf` function. .. ipython:: python @@ -42,7 +42,7 @@ However you can still incorporate your own functions, i.e. User-Defined Function df.select(is_null_arr(col("a"))).to_pandas() -Additionally the :func:`.udaf` function allows you to define User-Defined Aggregate Functions (UDAFs) +Additionally the :py:func:`~datafusion.udf.AggregateUDF.udaf` function allows you to define User-Defined Aggregate Functions (UDAFs) .. code-block:: python diff --git a/docs/source/user-guide/common-operations/windows.rst b/docs/source/user-guide/common-operations/windows.rst index f884c7e0d..5ef3c986c 100644 --- a/docs/source/user-guide/common-operations/windows.rst +++ b/docs/source/user-guide/common-operations/windows.rst @@ -21,7 +21,7 @@ Window Functions In this section you will learn about window functions. A window function utilizes values from one or multiple rows to produce a result for each individual row, unlike an aggregate function that provides a single value for multiple rows. -The functionality of window functions in DataFusion is supported by the dedicated :func:`.window` function. +The functionality of window functions in DataFusion is supported by the dedicated :py:func:`~datafusion.functions.window` function. 
We'll use the pokemon dataset (from Ritchie Vink) in the following examples. @@ -40,7 +40,7 @@ We'll use the pokemon dataset (from Ritchie Vink) in the following examples. ctx = SessionContext() df = ctx.read_csv("pokemon.csv") -Here is an example that shows how to compare each pokemons’s attack power with the average attack power in its :code:`"Type 1"` +Here is an example that shows how to compare each pokemons’s attack power with the average attack power in its ``"Type 1"`` .. ipython:: python @@ -54,7 +54,7 @@ Here is an example that shows how to compare each pokemons’s attack power with ) You can also control the order in which rows are processed by window functions by providing -a list of :func:`.order_by` functions for the :code:`order_by` parameter. +a list of ``order_by`` functions for the ``order_by`` parameter. .. ipython:: python diff --git a/docs/source/user-guide/configuration.rst b/docs/source/user-guide/configuration.rst index 0c1a4818a..7d330019f 100644 --- a/docs/source/user-guide/configuration.rst +++ b/docs/source/user-guide/configuration.rst @@ -18,8 +18,8 @@ Configuration ============= -Let's look at how we can configure DataFusion. When creating a :code:`SessionContext`, you can pass in -a :code:`SessionConfig` and :code:`RuntimeConfig` object. These two cover a wide range of options. +Let's look at how we can configure DataFusion. When creating a :py:class:`~datafusion.context.SessionContext`, you can pass in +a :py:class:`~datafusion.context.SessionConfig` and :py:class:`~datafusion.context.RuntimeConfig` object. These two cover a wide range of options. .. code-block:: python @@ -47,5 +47,5 @@ a :code:`SessionConfig` and :code:`RuntimeConfig` object. These two cover a wide print(ctx) -You can read more about available :code:`SessionConfig` options `here `_, -and about :code:`RuntimeConfig` options `here `_. 
+You can read more about available :py:class:`~datafusion.context.SessionConfig` options in the `rust DataFusion Configuration guide `_, +and about :code:`RuntimeConfig` options in the rust `online API documentation `_. diff --git a/docs/source/user-guide/io/avro.rst b/docs/source/user-guide/io/avro.rst index 85d546e2a..5f1ff728e 100644 --- a/docs/source/user-guide/io/avro.rst +++ b/docs/source/user-guide/io/avro.rst @@ -19,7 +19,7 @@ Avro ==== `Avro `_ is a serialization format for record data. Reading an avro file is very straightforward -with :meth:`.SessionContext.read_avro` +with :py:func:`~datafusion.context.SessionContext.read_avro` .. code-block:: python diff --git a/docs/source/user-guide/io/csv.rst b/docs/source/user-guide/io/csv.rst index 3f95c54a5..d2a62bfec 100644 --- a/docs/source/user-guide/io/csv.rst +++ b/docs/source/user-guide/io/csv.rst @@ -18,7 +18,7 @@ CSV === -Reading a csv is very straightforward with :meth:`.SessionContext.read_csv` +Reading a csv is very straightforward with :py:func:`~datafusion.context.SessionContext.read_csv` .. code-block:: python @@ -28,9 +28,9 @@ Reading a csv is very straightforward with :meth:`.SessionContext.read_csv` ctx = SessionContext() df = ctx.read_csv("file.csv") -An alternative is to use :meth:`.SessionContext.register_csv` +An alternative is to use :py:func:`~datafusion.context.SessionContext.register_csv` .. code-block:: python ctx.register_csv("file", "file.csv") - df = ctx.table("file") \ No newline at end of file + df = ctx.table("file") diff --git a/docs/source/user-guide/io/json.rst b/docs/source/user-guide/io/json.rst index 1ee065c44..f9da3755a 100644 --- a/docs/source/user-guide/io/json.rst +++ b/docs/source/user-guide/io/json.rst @@ -18,7 +18,7 @@ JSON ==== `JSON `_ (JavaScript Object Notation) is a lightweight data-interchange format. 
-When it comes to reading a JSON file, using :meth:`.SessionContext.read_json` is a simple and easy +When it comes to reading a JSON file, using :py:func:`~datafusion.context.SessionContext.read_json` is a simple and easy .. code-block:: python diff --git a/docs/source/user-guide/io/parquet.rst b/docs/source/user-guide/io/parquet.rst index 78bba30c5..75bc981cc 100644 --- a/docs/source/user-guide/io/parquet.rst +++ b/docs/source/user-guide/io/parquet.rst @@ -18,7 +18,7 @@ Parquet ======= -It is quite simple to read a parquet file using the :meth:`.SessionContext.read_parquet` function. +It is quite simple to read a parquet file using the :py:func:`~datafusion.context.SessionContext.read_parquet` function. .. code-block:: python @@ -28,9 +28,9 @@ It is quite simple to read a parquet file using the :meth:`.SessionContext.read_ ctx = SessionContext() df = ctx.read_parquet("file.parquet") -An alternative is to use :meth:`.SessionContext.register_parquet` +An alternative is to use :py:func:`~datafusion.context.SessionContext.register_parquet` .. 
code-block:: python ctx.register_parquet("file", "file.parquet") - df = ctx.table("file") \ No newline at end of file + df = ctx.table("file") diff --git a/examples/python-udf-comparisons.py b/examples/python-udf-comparisons.py index e2d856749..5a6f548fb 100644 --- a/examples/python-udf-comparisons.py +++ b/examples/python-udf-comparisons.py @@ -22,7 +22,7 @@ import time path = os.path.dirname(os.path.abspath(__file__)) -filepath = os.path.join(path, "../tpch/data/lineitem.parquet") +filepath = os.path.join(path, "./tpch/data/lineitem.parquet") # This example serves to demonstrate alternate approaches to answering the # question "return all of the rows that have a specific combination of these @@ -122,7 +122,7 @@ def is_of_interest_impl( is_of_interest = udf( is_of_interest_impl, - [pa.int32(), pa.int32(), pa.utf8()], + [pa.int64(), pa.int64(), pa.utf8()], pa.bool_(), "stable", ) @@ -170,7 +170,7 @@ def udf_using_pyarrow_compute_impl( udf_using_pyarrow_compute = udf( udf_using_pyarrow_compute_impl, - [pa.int32(), pa.int32(), pa.utf8()], + [pa.int64(), pa.int64(), pa.utf8()], pa.bool_(), "stable", ) diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 0569ac4b0..08ca3fe02 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -69,7 +69,9 @@ "ScalarUDF", "WindowFrame", "column", + "col", "literal", + "lit", "DFSchema", "runtime", "Catalog", @@ -93,7 +95,9 @@ def column(value: str): return Expr.column(value) -col = column +def col(value: str): + """Create a column expression.""" + return Expr.column(value) def literal(value): @@ -101,7 +105,10 @@ def literal(value): return Expr.literal(value) -lit = literal +def lit(value): + """Create a literal expression.""" + return Expr.literal(value) + udf = ScalarUDF.udf diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py index cec0be764..acd28f33d 100644 --- a/python/datafusion/catalog.py +++ b/python/datafusion/catalog.py @@ -39,7 +39,7 @@ def 
names(self) -> list[str]: return self.catalog.names() def database(self, name: str = "public") -> Database: - """Returns the database with the given `name` from this catalog.""" + """Returns the database with the given ``name`` from this catalog.""" return Database(self.catalog.database(name)) @@ -55,7 +55,7 @@ def names(self) -> set[str]: return self.db.names() def table(self, name: str) -> Table: - """Return the table with the given `name` from this database.""" + """Return the table with the given ``name`` from this database.""" return Table(self.db.table(name)) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index a717db106..922cc87a3 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -46,7 +46,7 @@ class SessionConfig: """Session configuration options.""" def __init__(self, config_options: dict[str, str] | None = None) -> None: - """Create a new `SessionConfig` with the given configuration options. + """Create a new :py:class:`SessionConfig` with the given configuration options. Args: config_options: Configuration options. @@ -63,7 +63,7 @@ def with_create_default_catalog_and_schema( automatically created. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = ( self.config_internal.with_create_default_catalog_and_schema(enabled) @@ -80,7 +80,7 @@ def with_default_catalog_and_schema( schema: Schema name. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_default_catalog_and_schema( catalog, schema @@ -88,13 +88,13 @@ def with_default_catalog_and_schema( return self def with_information_schema(self, enabled: bool = True) -> SessionConfig: - """Enable or disable the inclusion of `information_schema` virtual tables. 
+ """Enable or disable the inclusion of ``information_schema`` virtual tables. Args: - enabled: Whether to include `information_schema` virtual tables. + enabled: Whether to include ``information_schema`` virtual tables. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_information_schema(enabled) return self @@ -106,7 +106,7 @@ def with_batch_size(self, batch_size: int) -> SessionConfig: batch_size: Batch size. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_batch_size(batch_size) return self @@ -120,7 +120,7 @@ def with_target_partitions(self, target_partitions: int) -> SessionConfig: target_partitions: Number of target partitions. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_target_partitions( target_partitions @@ -136,7 +136,7 @@ def with_repartition_aggregations(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use repartitioning for aggregations. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_repartition_aggregations( enabled @@ -150,7 +150,7 @@ def with_repartition_joins(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use repartitioning for joins. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. 
""" self.config_internal = self.config_internal.with_repartition_joins(enabled) return self @@ -164,7 +164,7 @@ def with_repartition_windows(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use repartitioning for window functions. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_repartition_windows(enabled) return self @@ -178,7 +178,7 @@ def with_repartition_sorts(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use repartitioning for window functions. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_repartition_sorts(enabled) return self @@ -190,7 +190,7 @@ def with_repartition_file_scans(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use repartitioning for file scans. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_repartition_file_scans(enabled) return self @@ -202,7 +202,7 @@ def with_repartition_file_min_size(self, size: int) -> SessionConfig: size: Minimum file range size. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_repartition_file_min_size(size) return self @@ -216,7 +216,7 @@ def with_parquet_pruning(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use pruning predicate for parquet readers. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. 
""" self.config_internal = self.config_internal.with_parquet_pruning(enabled) return self @@ -229,7 +229,7 @@ def set(self, key: str, value: str) -> SessionConfig: value: Option value. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.set(key, value) return self @@ -239,14 +239,14 @@ class RuntimeConfig: """Runtime configuration options.""" def __init__(self) -> None: - """Create a new `RuntimeConfig` with default values.""" + """Create a new :py:class:`RuntimeConfig` with default values.""" self.config_internal = RuntimeConfigInternal() def with_disk_manager_disabled(self) -> RuntimeConfig: """Disable the disk manager, attempts to create temporary files will error. Returns: - A new `RuntimeConfig` object with the updated setting. + A new :py:class:`RuntimeConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_disk_manager_disabled() return self @@ -255,7 +255,7 @@ def with_disk_manager_os(self) -> RuntimeConfig: """Use the operating system's temporary directory for disk manager. Returns: - A new `RuntimeConfig` object with the updated setting. + A new :py:class:`RuntimeConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_disk_manager_os() return self @@ -267,7 +267,7 @@ def with_disk_manager_specified(self, *paths: str | pathlib.Path) -> RuntimeConf paths: Paths to use for the disk manager's temporary files. Returns: - A new `RuntimeConfig` object with the updated setting. + A new :py:class:`RuntimeConfig` object with the updated setting. """ paths = [str(p) for p in paths] self.config_internal = self.config_internal.with_disk_manager_specified(paths) @@ -277,7 +277,7 @@ def with_unbounded_memory_pool(self) -> RuntimeConfig: """Use an unbounded memory pool. Returns: - A new `RuntimeConfig` object with the updated setting. 
+ A new :py:class:`RuntimeConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_unbounded_memory_pool() return self @@ -303,7 +303,7 @@ def with_fair_spill_pool(self, size: int) -> RuntimeConfig: size: Size of the memory pool in bytes. Returns: - A new ``RuntimeConfig`` object with the updated setting. + A new :py:class:`RuntimeConfig` object with the updated setting. Examples usage:: @@ -316,14 +316,14 @@ def with_greedy_memory_pool(self, size: int) -> RuntimeConfig: """Use a greedy memory pool with the specified size. This pool works well for queries that do not need to spill or have a single - spillable operator. See `RuntimeConfig.with_fair_spill_pool` if there are + spillable operator. See :py:func:`with_fair_spill_pool` if there are multiple spillable operators that all will spill. Args: size: Size of the memory pool in bytes. Returns: - A new `RuntimeConfig` object with the updated setting. + A new :py:class:`RuntimeConfig` object with the updated setting. Example usage:: @@ -339,7 +339,7 @@ def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeConfig: path: Path to use for temporary files. Returns: - A new `RuntimeConfig` object with the updated setting. + A new :py:class:`RuntimeConfig` object with the updated setting. Example usage:: @@ -350,10 +350,10 @@ def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeConfig: class SQLOptions: - """Options to be used when performing SQL queries on the ``SessionContext``.""" + """Options to be used when performing SQL queries.""" def __init__(self) -> None: - """Create a new `SQLOptions` with default values. + """Create a new :py:class:`SQLOptions` with default values. The default values are: - DDL commands are allowed @@ -365,13 +365,13 @@ def __init__(self) -> None: def with_allow_ddl(self, allow: bool = True) -> SQLOptions: """Should DDL (Data Definition Language) commands be run? - Examples of DDL commands include `CREATE TABLE` and `DROP TABLE`. 
+ Examples of DDL commands include ``CREATE TABLE`` and ``DROP TABLE``. Args: allow: Allow DDL commands to be run. Returns: - A new `SQLOptions` object with the updated setting. + A new :py:class:`SQLOptions` object with the updated setting. Example usage:: @@ -383,13 +383,13 @@ def with_allow_ddl(self, allow: bool = True) -> SQLOptions: def with_allow_dml(self, allow: bool = True) -> SQLOptions: """Should DML (Data Manipulation Language) commands be run? - Examples of DML commands include `INSERT INTO` and `DELETE`. + Examples of DML commands include ``INSERT INTO`` and ``DELETE``. Args: allow: Allow DML commands to be run. Returns: - A new `SQLOptions` object with the updated setting. + A new :py:class:`SQLOptions` object with the updated setting. Example usage:: @@ -399,13 +399,13 @@ def with_allow_dml(self, allow: bool = True) -> SQLOptions: return self def with_allow_statements(self, allow: bool = True) -> SQLOptions: - """Should statements such as `SET VARIABLE` and `BEGIN TRANSACTION` be run? + """Should statements such as ``SET VARIABLE`` and ``BEGIN TRANSACTION`` be run? Args: allow: Allow statements to be run. Returns: - A new `SQLOptions` object with the updated setting. + A new :py:class:`SQLOptions` object with the updated setting. Example usage:: @@ -418,8 +418,7 @@ def with_allow_statements(self, allow: bool = True) -> SQLOptions: class SessionContext: """This is the main interface for executing queries and creating DataFrames. - See https://datafusion.apache.org/python/user-guide/basics.html for - additional information. + See :ref:`user_guide_concepts` in the online documentation for more information.
""" def __init__( @@ -438,7 +437,7 @@ def __init__( Example usage: The following example demostrates how to use the context to execute - a query against a CSV data source using the ``DataFrame`` API:: + a query against a CSV data source using the :py:class:`DataFrame` API:: from datafusion import SessionContext @@ -455,7 +454,7 @@ def register_object_store(self, schema: str, store: Any, host: str | None) -> No Args: schema: The data source schema. - store: The `ObjectStore` to register. + store: The :py:class:`~datafusion.object_store.ObjectStore` to register. host: URL for the host. """ self.ctx.register_object_store(schema, store, host) @@ -471,8 +470,9 @@ def register_listing_table( ) -> None: """Register multiple files as a single table. - Registers a `Table` that can assemble multiple files from locations in - an `ObjectStore` instance. + Registers a :py:class:`~datafusion.catalog.Table` that can assemble multiple + files from locations in an :py:class:`~datafusion.object_store.ObjectStore` + instance. Args: name: Name of the resultant table. @@ -496,11 +496,12 @@ def register_listing_table( ) def sql(self, query: str, options: SQLOptions | None = None) -> DataFrame: - """Create a `DataFrame` from SQL query text. + """Create a :py:class:`~datafusion.DataFrame` from SQL query text. - Note: This API implements DDL statements such as `CREATE TABLE` and - `CREATE VIEW` and DML statements such as `INSERT INTO` with in-memory - default implementation. See `SessionContext.sql_with_options`. + Note: This API implements DDL statements such as ``CREATE TABLE`` and + ``CREATE VIEW`` and DML statements such as ``INSERT INTO`` with in-memory + default implementation.See + :py:func:`~datafusion.context.SessionContext.sql_with_options`. Args: query: SQL query text. 
@@ -514,7 +515,7 @@ def sql(self, query: str, options: SQLOptions | None = None) -> DataFrame: return DataFrame(self.ctx.sql_with_options(query, options.options_internal)) def sql_with_options(self, query: str, options: SQLOptions) -> DataFrame: - """Create a `DataFrame` from SQL query text. + """Create a :py:class:`~datafusion.dataframe.DataFrame` from SQL query text. This function will first validating that the query is allowed by the provided options. @@ -537,7 +538,7 @@ def create_dataframe( """Create and return a dataframe using the provided partitions. Args: - partitions: `RecordBatch` partitions to register. + partitions: :py:class:`pyarrow.RecordBatch` partitions to register. name: Resultant dataframe name. schema: Schema for the partitions. @@ -547,7 +548,7 @@ def create_dataframe( return DataFrame(self.ctx.create_dataframe(partitions, name, schema)) def create_dataframe_from_logical_plan(self, plan: LogicalPlan) -> DataFrame: - """Create a `DataFrame` from an existing logical plan. + """Create a :py:class:`~datafusion.dataframe.DataFrame` from an existing plan. Args: plan: Logical plan. @@ -560,7 +561,7 @@ def create_dataframe_from_logical_plan(self, plan: LogicalPlan) -> DataFrame: def from_pylist( self, data: list[dict[str, Any]], name: str | None = None ) -> DataFrame: - """Create a `DataFrame` from a list of dictionaries. + """Create a :py:class:`~datafusion.dataframe.DataFrame` from a list. Args: data: List of dictionaries. @@ -574,7 +575,7 @@ def from_pylist( def from_pydict( self, data: dict[str, list[Any]], name: str | None = None ) -> DataFrame: - """Create a `DataFrame` from a dictionary of lists. + """Create a :py:class:`~datafusion.dataframe.DataFrame` from a dictionary. Args: data: Dictionary of lists. @@ -588,7 +589,7 @@ def from_pydict( def from_arrow_table( self, data: pyarrow.Table, name: str | None = None ) -> DataFrame: - """Create a `DataFrame` from an Arrow table. 
+ """Create a :py:class:`~datafusion.dataframe.DataFrame` from an Arrow table. Args: data: Arrow table. @@ -600,7 +601,7 @@ def from_arrow_table( return DataFrame(self.ctx.from_arrow_table(data, name)) def from_pandas(self, data: pandas.DataFrame, name: str | None = None) -> DataFrame: - """Create a `DataFrame` from a Pandas DataFrame. + """Create a :py:class:`~datafusion.dataframe.DataFrame` from a Pandas DataFrame. Args: data: Pandas DataFrame. @@ -612,7 +613,7 @@ def from_pandas(self, data: pandas.DataFrame, name: str | None = None) -> DataFr return DataFrame(self.ctx.from_pandas(data, name)) def from_polars(self, data: polars.DataFrame, name: str | None = None) -> DataFrame: - """Create a `DataFrame` from a Polars DataFrame. + """Create a :py:class:`~datafusion.dataframe.DataFrame` from a Polars DataFrame. Args: data: Polars DataFrame. @@ -799,7 +800,7 @@ def register_avro( ) def register_dataset(self, name: str, dataset: pyarrow.dataset.Dataset) -> None: - """Register a `pyarrow.dataset.Dataset` as a table. + """Register a :py:class:`pyarrow.dataset.Dataset` as a table. Args: name: Name of the table to register. 
@@ -809,18 +810,18 @@ def register_dataset(self, name: str, dataset: pyarrow.dataset.Dataset) -> None: def register_udf(self, udf: ScalarUDF) -> None: """Register a user-defined function (UDF) with the context.""" - self.ctx.register_udf(udf.udf) + self.ctx.register_udf(udf._udf) def register_udaf(self, udaf: AggregateUDF) -> None: """Register a user-defined aggregation function (UDAF) with the context.""" - self.ctx.register_udaf(udaf) + self.ctx.register_udaf(udaf._udaf) def catalog(self, name: str = "datafusion") -> Catalog: """Retrieve a catalog by name.""" return self.ctx.catalog(name) @deprecated( - "Use the catalog provider interface `SessionContext.catalog` to " + "Use the catalog provider interface ``SessionContext.catalog`` to " "examine available catalogs, schemas and tables" ) def tables(self) -> set[str]: @@ -828,7 +829,7 @@ def tables(self) -> set[str]: return self.ctx.tables() def table(self, name: str) -> DataFrame: - """Retrieve a `DataFrame` representing a previously registered table.""" + """Retrieve a previously registered table by name.""" return DataFrame(self.ctx.table(name)) def table_exist(self, name: str) -> bool: @@ -836,11 +837,11 @@ def table_exist(self, name: str) -> bool: return self.ctx.table_exist(name) def empty_table(self) -> DataFrame: - """Create an empty `DataFrame`.""" + """Create an empty :py:class:`~datafusion.dataframe.DataFrame`.""" return DataFrame(self.ctx.empty_table()) def session_id(self) -> str: - """Retrun an id that uniquely identifies this `SessionContext`.""" + """Return an id that uniquely identifies this :py:class:`SessionContext`.""" return self.ctx.session_id() def read_json( @@ -852,7 +853,7 @@ def read_json( table_partition_cols: list[tuple[str, str]] | None = None, file_compression_type: str | None = None, ) -> DataFrame: - """Create a `DataFrame` for reading a line-delimited JSON data source. + """Read a line-delimited JSON data source. Args: path: Path to the JSON file.
@@ -891,7 +892,7 @@ def read_csv( table_partition_cols: list[tuple[str, str]] | None = None, file_compression_type: str | None = None, ) -> DataFrame: - """Create a `DataFrame` for reading a CSV data source. + """Read a CSV data source. Args: path: Path to the CSV file @@ -936,7 +937,7 @@ def read_parquet( schema: pyarrow.Schema | None = None, file_sort_order: list[list[Expr]] | None = None, ) -> DataFrame: - """Create a `DataFrame` for reading Parquet data source. + """Read a Parquet source into a :py:class:`~datafusion.dataframe.DataFrame`. Args: path: Path to the Parquet file. @@ -977,7 +978,7 @@ def read_avro( file_partition_cols: list[tuple[str, str]] | None = None, file_extension: str = ".avro", ) -> DataFrame: - """Create a ``DataFrame`` for reading Avro data source. + """Create a :py:class:`DataFrame` for reading Avro data source. Args: path: Path to the Avro file. @@ -995,9 +996,14 @@ def read_avro( ) def read_table(self, table: Table) -> DataFrame: - """Creates a ``DataFrame`` for a ``Table`` such as a ``ListingTable``.""" + """Creates a :py:class:`~datafusion.dataframe.DataFrame` from a table. + + For a :py:class:`~datafusion.catalog.Table` such as a + :py:class:`~datafusion.catalog.ListingTable`, create a + :py:class:`~datafusion.dataframe.DataFrame`. + """ return DataFrame(self.ctx.read_table(table)) def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream: - """Execute the `plan` and return the results.""" + """Execute the ``plan`` and return the results.""" return RecordBatchStream(self.ctx.execute(plan, partitions)) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 68e6298f7..fa7398442 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -14,10 +14,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""DataFrame is one of the core concepts in DataFusion. 
+""":py:class:`DataFrame` is one of the core concepts in DataFusion. -See https://datafusion.apache.org/python/user-guide/basics.html for more -information. +See :ref:`user_guide_concepts` in the online documentation for more information. """ from __future__ import annotations @@ -43,19 +42,19 @@ class DataFrame: """Two dimensional table representation of data. - See https://datafusion.apache.org/python/user-guide/basics.html for more - information. + See :ref:`user_guide_concepts` in the online documentation for more information. """ def __init__(self, df: DataFrameInternal) -> None: """This constructor is not to be used by the end user. - See ``SessionContext`` for methods to create DataFrames. + See :py:class:`~datafusion.context.SessionContext` for methods to + create a :py:class:`DataFrame`. """ self.df = df def __getitem__(self, key: str | List[str]) -> DataFrame: - """Return a new `DataFrame` with the specified column or columns. + """Return a new :py:class:`DataFrame` with the specified column or columns. Args: key: Column name or list of column names to select. @@ -74,7 +73,7 @@ def __repr__(self) -> str: return self.df.__repr__() def describe(self) -> DataFrame: - """Return a new `DataFrame` that has statistics for a DataFrame. + """Return the statistics for this DataFrame. Only summarized numeric datatypes at the moments and returns nulls for non-numeric datatypes. @@ -87,7 +86,7 @@ def describe(self) -> DataFrame: return DataFrame(self.df.describe()) def schema(self) -> pa.Schema: - """Return the `pyarrow.Schema` describing the output of this DataFrame. + """Return the :py:class:`pyarrow.Schema` of this DataFrame. The output schema contains information on the name, data type, and nullability for each column. @@ -106,10 +105,10 @@ def select_columns(self, *args: str) -> DataFrame: return self.select(*args) def select(self, *exprs: Expr | str) -> DataFrame: - """Project arbitrary expressions into a new `DataFrame`. 
+ """Project arbitrary expressions into a new :py:class:`DataFrame`. Args: - exprs: Either column names or `Expr` to select. + exprs: Either column names or :py:class:`~datafusion.expr.Expr` to select. Returns: DataFrame after projection. It has one column for each expression. @@ -117,9 +116,9 @@ def select(self, *exprs: Expr | str) -> DataFrame: Example usage: The following example will return 3 columns from the original dataframe. - The first two columns will be the original column `a` and `b` since the + The first two columns will be the original column ``a`` and ``b`` since the string "a" is assumed to refer to column selection. Also a duplicate of - column `a` will be returned with the column name `alternate_a`:: + column ``a`` will be returned with the column name ``alternate_a``:: df = df.select("a", col("b"), col("a").alias("alternate_a")) @@ -131,12 +130,12 @@ def select(self, *exprs: Expr | str) -> DataFrame: return DataFrame(self.df.select(*exprs)) def filter(self, *predicates: Expr) -> DataFrame: - """Return a DataFrame for which `predicate` evaluates to `True`. + """Return a DataFrame for which ``predicate`` evaluates to ``True``. - Rows for which `predicate` evaluates to `False` or `None` are filtered + Rows for which ``predicate`` evaluates to ``False`` or ``None`` are filtered out. If more than one predicate is provided, these predicates will be combined as a logical AND. If more complex logic is required, see the - logical operations in `datafusion.functions`. + logical operations in :py:mod:`~datafusion.functions`. Args: predicates: Predicate expression(s) to filter the DataFrame. @@ -162,12 +161,12 @@ def with_column(self, name: str, expr: Expr) -> DataFrame: return DataFrame(self.df.with_column(name, expr.expr)) def with_column_renamed(self, old_name: str, new_name: str) -> DataFrame: - """Rename one column by applying a new projection. + r"""Rename one column by applying a new projection. 
This is a no-op if the column to be renamed does not exist. The method supports case sensitive rename with wrapping column name - into one the following symbols (" or ' or `). + into one of the following symbols (" or ' or \`). Args: old_name: Old column name. @@ -196,7 +195,7 @@ def sort(self, *exprs: Expr) -> DataFrame: """Sort the DataFrame by the specified sorting expressions. Note that any expression can be turned into a sort expression by - calling its `sort` method. + calling its ``sort`` method. Args: exprs: Sort expressions, applied in order. @@ -208,7 +207,7 @@ def sort(self, *exprs: Expr) -> DataFrame: return DataFrame(self.df.sort(*exprs)) def limit(self, count: int, offset: int = 0) -> DataFrame: - """Return a new `DataFrame` with a limited number of rows. + """Return a new :py:class:`DataFrame` with a limited number of rows. Args: count: Number of rows to limit the DataFrame to. @@ -220,14 +219,14 @@ def limit(self, count: int, offset: int = 0) -> DataFrame: return DataFrame(self.df.limit(count, offset)) def collect(self) -> list[pa.RecordBatch]: - """Execute this `DataFrame` and collect results into memory. + """Execute this :py:class:`DataFrame` and collect results into memory. - Prior to calling `collect`, modifying a DataFrme simply updates a plan - (no actual computation is performed). Calling `collect` triggers the + Prior to calling ``collect``, modifying a DataFrame simply updates a plan + (no actual computation is performed). Calling ``collect`` triggers the computation. Returns: - List of `pyarrow.RecordBatch`es collected from the DataFrame. + List of :py:class:`pyarrow.RecordBatch` collected from the DataFrame. """ return self.df.collect() @@ -242,11 +241,11 @@ def cache(self) -> DataFrame: def collect_partitioned(self) -> list[list[pa.RecordBatch]]: """Execute this DataFrame and collect all partitioned results. 
- This operation returns ``RecordBatch`` maintaining the input + This operation returns :py:class:`pyarrow.RecordBatch` maintaining the input partitioning. Returns: - List of list of ``RecordBatch`` collected from the + List of list of :py:class:`pyarrow.RecordBatch` collected from the DataFrame. """ return self.df.collect_partitioned() @@ -260,7 +259,7 @@ def show(self, num: int = 20) -> None: self.df.show(num) def distinct(self) -> DataFrame: - """Return a new `DataFrame` with all duplicated rows removed. + """Return a new :py:class:`DataFrame` with all duplicated rows removed. Returns: DataFrame after removing duplicates. @@ -273,7 +272,7 @@ def join( join_keys: tuple[list[str], list[str]], how: str, ) -> DataFrame: - """Join this `DataFrame` with another `DataFrame`. + """Join this :py:class:`DataFrame` with another :py:class:`DataFrame`. Join keys are a pair of lists of column names in the left and right dataframes, respectively. These lists must have the same length. @@ -292,11 +291,11 @@ def join( def explain(self, verbose: bool = False, analyze: bool = False) -> DataFrame: """Return a DataFrame with the explanation of its plan so far. - If `analyze` is specified, runs the plan and reports metrics. + If ``analyze`` is specified, runs the plan and reports metrics. Args: - verbose: If `True`, more details will be included. - analyze: If `True`, the plan will run and metrics reported. + verbose: If ``True``, more details will be included. + analyze: If ``True``, the plan will run and metrics reported. Returns: DataFrame with the explanation of its plan. @@ -304,7 +303,7 @@ def explain(self, verbose: bool = False, analyze: bool = False) -> DataFrame: return DataFrame(self.df.explain(verbose, analyze)) def logical_plan(self) -> LogicalPlan: - """Return the unoptimized `LogicalPlan` that comprises this `DataFrame`. + """Return the unoptimized ``LogicalPlan``. Returns: Unoptimized logical plan. 
@@ -312,7 +311,7 @@ def logical_plan(self) -> LogicalPlan: return self.df.logical_plan() def optimized_logical_plan(self) -> LogicalPlan: - """Return the optimized `LogicalPlan` that comprises this `DataFrame`. + """Return the optimized ``LogicalPlan``. Returns: Optimized logical plan. @@ -320,7 +319,7 @@ def optimized_logical_plan(self) -> LogicalPlan: return self.df.optimized_logical_plan() def execution_plan(self) -> ExecutionPlan: - """Return the execution/physical plan that comprises this `DataFrame`. + """Return the execution/physical plan. Returns: Execution plan. @@ -328,7 +327,7 @@ def execution_plan(self) -> ExecutionPlan: return self.df.execution_plan() def repartition(self, num: int) -> DataFrame: - """Repartition a DataFrame into `num` partitions. + """Repartition a DataFrame into ``num`` partitions. The batches allocation uses a round-robin algorithm. @@ -354,13 +353,13 @@ def repartition_by_hash(self, *exprs: Expr, num: int) -> DataFrame: return DataFrame(self.df.repartition_by_hash(*exprs, num=num)) def union(self, other: DataFrame, distinct: bool = False) -> DataFrame: - """Calculate the union of two `DataFrame`s. + """Calculate the union of two :py:class:`DataFrame`. - The two `DataFrame`s must have exactly the same schema. + The two :py:class:`DataFrame` must have exactly the same schema. Args: other: DataFrame to union with. - distinct: If `True`, duplicate rows will be removed. + distinct: If ``True``, duplicate rows will be removed. Returns: DataFrame after union. @@ -368,9 +367,9 @@ def union(self, other: DataFrame, distinct: bool = False) -> DataFrame: return DataFrame(self.df.union(other.df, distinct)) def union_distinct(self, other: DataFrame) -> DataFrame: - """Calculate the distinct union of two `DataFrame`s. + """Calculate the distinct union of two :py:class:`DataFrame`. - The two `DataFrame`s must have exactly the same schema. + The two :py:class:`DataFrame` must have exactly the same schema. Any duplicate rows are discarded. 
Args: @@ -382,9 +381,9 @@ def union_distinct(self, other: DataFrame) -> DataFrame: return DataFrame(self.df.union_distinct(other.df)) def intersect(self, other: DataFrame) -> DataFrame: - """Calculate the intersection of two `DataFrame`s. + """Calculate the intersection of two :py:class:`DataFrame`. - The two `DataFrame`s must have exactly the same schema. + The two :py:class:`DataFrame` must have exactly the same schema. Args: other: DataFrame to intersect with. @@ -395,9 +394,9 @@ def intersect(self, other: DataFrame) -> DataFrame: return DataFrame(self.df.intersect(other.df)) def except_all(self, other: DataFrame) -> DataFrame: - """Calculate the exception of two `DataFrame`s. + """Calculate the exception of two :py:class:`DataFrame`. - The two `DataFrame`s must have exactly the same schema. + The two :py:class:`DataFrame` must have exactly the same schema. Args: other: DataFrame to calculate exception with. @@ -408,7 +407,7 @@ def except_all(self, other: DataFrame) -> DataFrame: return DataFrame(self.df.except_all(other.df)) def write_csv(self, path: str | pathlib.Path, with_header: bool = False) -> None: - """Execute the `DataFrame` and write the results to a CSV file. + """Execute the :py:class:`DataFrame` and write the results to a CSV file. Args: path: Path of the CSV file to write. @@ -422,7 +421,7 @@ def write_parquet( compression: str = "uncompressed", compression_level: int | None = None, ) -> None: - """Execute the `DataFrame` and write the results to a Parquet file. + """Execute the :py:class:`DataFrame` and write the results to a Parquet file. Args: path: Path of the Parquet file to write. @@ -432,7 +431,7 @@ def write_parquet( self.df.write_parquet(str(path), compression, compression_level) def write_json(self, path: str | pathlib.Path) -> None: - """Execute the `DataFrame` and write the results to a JSON file. + """Execute the :py:class:`DataFrame` and write the results to a JSON file. Args: path: Path of the JSON file to write. 
@@ -440,7 +439,7 @@ def write_json(self, path: str | pathlib.Path) -> None: self.df.write_json(str(path)) def to_arrow_table(self) -> pa.Table: - """Execute the `DataFrame` and convert it into an Arrow Table. + """Execute the :py:class:`DataFrame` and convert it into an Arrow Table. Returns: Arrow Table. @@ -465,7 +464,7 @@ def execute_stream_partitioned(self) -> list[RecordBatchStream]: return [RecordBatchStream(rbs) for rbs in streams] def to_pandas(self) -> pd.DataFrame: - """Execute the `DataFrame` and convert it into a Pandas DataFrame. + """Execute the :py:class:`DataFrame` and convert it into a Pandas DataFrame. Returns: Pandas DataFrame. @@ -473,7 +472,7 @@ def to_pandas(self) -> pd.DataFrame: return self.df.to_pandas() def to_pylist(self) -> list[dict[str, Any]]: - """Execute the `DataFrame` and convert it into a list of dictionaries. + """Execute the :py:class:`DataFrame` and convert it into a list of dictionaries. Returns: List of dictionaries. @@ -481,7 +480,7 @@ def to_pylist(self) -> list[dict[str, Any]]: return self.df.to_pylist() def to_pydict(self) -> dict[str, list[Any]]: - """Execute the `DataFrame` and convert it into a dictionary of lists. + """Execute the :py:class:`DataFrame` and convert it into a dictionary of lists. Returns: Dictionary of lists. @@ -489,7 +488,7 @@ def to_pydict(self) -> dict[str, list[Any]]: return self.df.to_pydict() def to_polars(self) -> pl.DataFrame: - """Execute the `DataFrame` and convert it into a Polars DataFrame. + """Execute the :py:class:`DataFrame` and convert it into a Polars DataFrame. Returns: Polars DataFrame. @@ -497,7 +496,7 @@ def to_polars(self) -> pl.DataFrame: return self.df.to_polars() def count(self) -> int: - """Return the total number of rows in this `DataFrame`. + """Return the total number of rows in this :py:class:`DataFrame`. Note that this method will actually run a plan to calculate the count, which may be slow for large or complicated DataFrames. 
@@ -507,9 +506,9 @@ def count(self) -> int: """ return self.df.count() - @deprecated("Use :func:`unnest_columns` instead.") + @deprecated("Use :py:func:`unnest_columns` instead.") def unnest_column(self, column: str, preserve_nulls: bool = True) -> DataFrame: - """See ``unnest_columns``.""" + """See :py:func:`unnest_columns`.""" return DataFrame(self.df.unnest_column(column, preserve_nulls=preserve_nulls)) def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFrame: diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 955c4e736..71fcf397b 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -17,7 +17,7 @@ """This module supports expressions, one of the core concepts in DataFusion. -See ``Expr`` for more details. +See :ref:`Expressions` in the online documentation for more details. """ from __future__ import annotations @@ -159,8 +159,7 @@ class Expr: """Expression object. Expressions are one of the core concepts in DataFusion. See - https://datafusion.apache.org/python/user-guide/common-operations/expressions.html - for more information. + :ref:`Expressions` in the online documentation for more information. """ def __init__(self, expr: expr_internal.Expr) -> None: @@ -328,7 +327,7 @@ def __lt__(self, rhs: Any) -> Expr: def literal(value: Any) -> Expr: """Creates a new expression representing a scalar value. - `value` must be a valid PyArrow scalar value or easily castable to one. + ``value`` must be a valid PyArrow scalar value or easily castable to one. 
""" if not isinstance(value, pa.Scalar): value = pa.scalar(value) @@ -336,7 +335,7 @@ def literal(value: Any) -> Expr: @staticmethod def column(value: str) -> Expr: - """Creates a new expression representing a column in a ``DataFrame``.""" + """Creates a new expression representing a column.""" return Expr(expr_internal.Expr.column(value)) def alias(self, name: str) -> Expr: @@ -344,7 +343,7 @@ def alias(self, name: str) -> Expr: return Expr(self.expr.alias(name)) def sort(self, ascending: bool = True, nulls_first: bool = True) -> Expr: - """Creates a sort ``Expr`` from an existing ``Expr``. + """Creates a sort :py:class:`Expr` from an existing :py:class:`Expr`. Args: ascending: If true, sort in ascending order. @@ -365,7 +364,7 @@ def rex_type(self) -> RexType: A Rex (Row Expression) specifies a single row of data.That specification could include user defined functions or types. RexType identifies the - row as one of the possible valid ``RexType``(s). + row as one of the possible valid ``RexType``. """ return self.expr.rex_type() @@ -416,12 +415,12 @@ def __init__( """Construct a window frame using the given parameters. Args: - units: Should be one of `rows`, `range`, or `groups`. + units: Should be one of ``rows``, ``range``, or ``groups``. start_bound: Sets the preceeding bound. Must be >= 0. If none, this - will be set to unbounded. If unit type is `groups`, this + will be set to unbounded. If unit type is ``groups``, this parameter must be set. end_bound: Sets the following bound. Must be >= 0. If none, this - will be set to unbounded. If unit type is `groups`, this + will be set to unbounded. If unit type is ``groups``, this parameter must be set. """ self.window_frame = expr_internal.WindowFrame(units, start_bound, end_bound) @@ -442,7 +441,7 @@ def get_upper_bound(self): class WindowFrameBound: """Defines a single window frame bound. - ``WindowFrame`` typically requires a start and end bound. 
+ :py:class:`WindowFrame` typically requires a start and end bound. """ def __init__(self, frame_bound: expr_internal.WindowFrameBound) -> None: @@ -489,7 +488,7 @@ def __init__(self, case_builder: expr_internal.CaseBuilder) -> None: """Constructs a case builder. This is not typically called by the end user directly. See - ``datafusion.functions.case`` instead. + :py:func:`datafusion.functions.case` instead. """ self.case_builder = case_builder diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index be83d359f..82b5056d7 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""This module contains the user functions for operating on ``Expr``.""" +"""User functions for operating on :py:class:`~datafusion.expr.Expr`.""" from __future__ import annotations @@ -263,12 +263,12 @@ def nullif(expr1: Expr, expr2: Expr) -> Expr: def encode(input: Expr, encoding: Expr) -> Expr: - """Encode the `input`, using the `encoding`. encoding can be base64 or hex.""" + """Encode the ``input``, using the ``encoding``. encoding can be base64 or hex.""" return Expr(f.encode(input.expr, encoding.expr)) def decode(input: Expr, encoding: Expr) -> Expr: - """Decode the `input`, using the `encoding`. encoding can be base64 or hex.""" + """Decode the ``input``, using the ``encoding``. encoding can be base64 or hex.""" return Expr(f.decode(input.expr, encoding.expr)) @@ -280,7 +280,7 @@ def array_to_string(expr: Expr, delimiter: Expr) -> Expr: def array_join(expr: Expr, delimiter: Expr) -> Expr: """Converts each element to its text representation. - This is an alias for :func:`array_to_string`. + This is an alias for :py:func:`array_to_string`. 
""" return array_to_string(expr, delimiter) @@ -288,7 +288,7 @@ def array_join(expr: Expr, delimiter: Expr) -> Expr: def list_to_string(expr: Expr, delimiter: Expr) -> Expr: """Converts each element to its text representation. - This is an alias for :func:`array_to_string`. + This is an alias for :py:func:`array_to_string`. """ return array_to_string(expr, delimiter) @@ -296,13 +296,13 @@ def list_to_string(expr: Expr, delimiter: Expr) -> Expr: def list_join(expr: Expr, delimiter: Expr) -> Expr: """Converts each element to its text representation. - This is an alias for :func:`array_to_string`. + This is an alias for :py:func:`array_to_string`. """ return array_to_string(expr, delimiter) def in_list(arg: Expr, values: list[Expr], negated: bool = False) -> Expr: - """Returns whether the argument is contained within the list `values`.""" + """Returns whether the argument is contained within the list ``values``.""" values = [v.expr for v in values] return Expr(f.in_list(arg.expr, values, negated)) @@ -326,9 +326,9 @@ def concat(*args: Expr) -> Expr: def concat_ws(separator: str, *args: Expr) -> Expr: - """Concatenates the list `args` with the separator. + """Concatenates the list ``args`` with the separator. - `NULL` arugments are ignored. `separator` should not be `NULL`. + ``NULL`` arugments are ignored. ``separator`` should not be ``NULL``. """ args = [arg.expr for arg in args] return Expr(f.concat_ws(separator, args)) @@ -355,9 +355,11 @@ def count_star() -> Expr: def case(expr: Expr) -> CaseBuilder: - """Create a ``CaseBuilder`` to match cases for the expression ``expr``. + """Create a case expression. - See ``datafusion.expr.CaseBuilder`` for detailed usage of ``CaseBuilder``. + Create a :py:class:`~datafusion.expr.CaseBuilder` to match cases for the + expression ``expr``. See :py:class:`~datafusion.expr.CaseBuilder` for + detailed usage. 
""" return CaseBuilder(f.case(expr.expr)) @@ -462,12 +464,12 @@ def character_length(arg: Expr) -> Expr: def length(string: Expr) -> Expr: - """The number of characters in the `string`.""" + """The number of characters in the ``string``.""" return Expr(f.length(string.expr)) def char_length(string: Expr) -> Expr: - """The number of characters in the `string`.""" + """The number of characters in the ``string``.""" return Expr(f.char_length(string.expr)) @@ -477,7 +479,7 @@ def chr(arg: Expr) -> Expr: def coalesce(*args: Expr) -> Expr: - """Returns the value of the first expr in `args` which is not NULL.""" + """Returns the value of the first expr in ``args`` which is not NULL.""" args = [arg.expr for arg in args] return Expr(f.coalesce(*args)) @@ -503,7 +505,7 @@ def degrees(arg: Expr) -> Expr: def ends_with(arg: Expr, suffix: Expr) -> Expr: - """Returns true if the `string` ends with the `suffix`, false otherwise.""" + """Returns true if the ``string`` ends with the ``suffix``, false otherwise.""" return Expr(f.ends_with(arg.expr, suffix.expr)) @@ -521,9 +523,9 @@ def find_in_set(string: Expr, string_list: Expr) -> Expr: """Find a string in a list of strings. Returns a value in the range of 1 to N if the string is in the string list - `string_list` consisting of N substrings. + ``string_list`` consisting of N substrings. - The string list is a string composed of substrings separated by `,` characters. + The string list is a string composed of substrings separated by ``,`` characters. """ return Expr(f.find_in_set(string.expr, string_list.expr)) @@ -541,16 +543,16 @@ def gcd(x: Expr, y: Expr) -> Expr: def initcap(string: Expr) -> Expr: """Set the initial letter of each word to capital. - Converts the first letter of each word in `string` to uppercase and the remaining + Converts the first letter of each word in ``string`` to uppercase and the remaining characters to lowercase. 
""" return Expr(f.initcap(string.expr)) def instr(string: Expr, substring: Expr) -> Expr: - """Finds the position from where the `substring` matches the `string`. + """Finds the position from where the ``substring`` matches the ``string``. - This is an alias for :func:`strpos`. + This is an alias for :py:func:`strpos`. """ return strpos(string, substring) @@ -566,7 +568,7 @@ def lcm(x: Expr, y: Expr) -> Expr: def left(string: Expr, n: Expr) -> Expr: - """Returns the first `n` characters in the `string`.""" + """Returns the first ``n`` characters in the ``string``.""" return Expr(f.left(string.expr, n.expr)) @@ -581,7 +583,7 @@ def ln(arg: Expr) -> Expr: def log(base: Expr, num: Expr) -> Expr: - """Returns the logarithm of a number for a particular `base`.""" + """Returns the logarithm of a number for a particular ``base``.""" return Expr(f.log(base.expr, num.expr)) @@ -622,7 +624,7 @@ def md5(arg: Expr) -> Expr: def nanvl(x: Expr, y: Expr) -> Expr: - """Returns `x` if `x` is not `NaN`. Otherwise returns `y`.""" + """Returns ``x`` if ``x`` is not ``NaN``. Otherwise returns ``y``.""" return Expr(f.nanvl(x.expr, y.expr)) @@ -636,8 +638,8 @@ def overlay( ) -> Expr: """Replace a substring with a new substring. - Replace the substring of string that starts at the `start`'th character and - extends for `length` characters with new substring. + Replace the substring of string that starts at the ``start``'th character and + extends for ``length`` characters with new substring. """ if length is None: return Expr(f.overlay(string.expr, substring.expr, start.expr)) @@ -650,22 +652,22 @@ def pi() -> Expr: def position(string: Expr, substring: Expr) -> Expr: - """Finds the position from where the `substring` matches the `string`. + """Finds the position from where the ``substring`` matches the ``string``. - This is an alias for :func:`strpos`. + This is an alias for :py:func:`strpos`. 
""" return strpos(string, substring) def power(base: Expr, exponent: Expr) -> Expr: - """Returns `base` raised to the power of `exponent`.""" + """Returns ``base`` raised to the power of ``exponent``.""" return Expr(f.power(base.expr, exponent.expr)) def pow(base: Expr, exponent: Expr) -> Expr: - """Returns `base` raised to the power of `exponent`. + """Returns ``base`` raised to the power of ``exponent``. - This is an alias of `power`. + This is an alias of :py:func:`power`. """ return power(base, exponent) @@ -690,7 +692,7 @@ def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: """Perform regular expression (regex) matching. Returns an array with each element containing the leftmost-first match of the - corresponding index in `regex` to string in `string`. + corresponding index in ``regex`` to string in ``string``. """ if flags is not None: flags = flags.expr @@ -714,12 +716,12 @@ def regexp_replace( def repeat(string: Expr, n: Expr) -> Expr: - """Repeats the `string` to `n` times.""" + """Repeats the ``string`` to ``n`` times.""" return Expr(f.repeat(string.expr, n.expr)) def replace(string: Expr, from_val: Expr, to_val: Expr) -> Expr: - """Replaces all occurrences of `from` with `to` in the `string`.""" + """Replaces all occurrences of ``from_val`` with ``to_val`` in the ``string``.""" return Expr(f.replace(string.expr, from_val.expr, to_val.expr)) @@ -729,7 +731,7 @@ def reverse(arg: Expr) -> Expr: def right(string: Expr, n: Expr) -> Expr: - """Returns the last `n` characters in the `string`.""" + """Returns the last ``n`` characters in the ``string``.""" return Expr(f.right(string.expr, n.expr)) @@ -738,7 +740,7 @@ def round(value: Expr, decimal_places: Expr = Expr.literal(0)) -> Expr: If the optional ``decimal_places`` is specified, round to the nearest number of decimal places. You can specify a negative number of decimal places. For example - `round(lit(125.2345), lit(-2))` would yield a value of `100.0`. 
+ ``round(lit(125.2345), lit(-2))`` would yield a value of ``100.0``. """ return Expr(f.round(value.expr, decimal_places.expr)) @@ -813,22 +815,26 @@ def starts_with(string: Expr, prefix: Expr) -> Expr: def strpos(string: Expr, substring: Expr) -> Expr: - """Finds the position from where the `substring` matches the `string`.""" + """Finds the position from where the ``substring`` matches the ``string``.""" return Expr(f.strpos(string.expr, substring.expr)) def substr(string: Expr, position: Expr) -> Expr: - """Substring from the `position` to the end.""" + """Substring from the ``position`` to the end.""" return Expr(f.substr(string.expr, position.expr)) def substr_index(string: Expr, delimiter: Expr, count: Expr) -> Expr: - """Returns the substring from `string` before `count` occurrences of `delimiter`.""" + """Returns an indexed substring. + + The return will be the ``string`` from before ``count`` occurrences of + ``delimiter``. + """ return Expr(f.substr_index(string.expr, delimiter.expr, count.expr)) def substring(string: Expr, position: Expr, length: Expr) -> Expr: - """Substring from the `position` with `length` characters.""" + """Substring from the ``position`` with ``length`` characters.""" return Expr(f.substring(string.expr, position.expr, length.expr)) @@ -856,7 +862,7 @@ def now() -> Expr: def to_timestamp(arg: Expr, *formatters: Expr) -> Expr: - """Converts a string and optional formats to a `Timestamp` in nanoseconds. + """Converts a string and optional formats to a ``Timestamp`` in nanoseconds. For usage of ``formatters`` see the rust chrono package ``strftime`` package. @@ -870,33 +876,33 @@ def to_timestamp(arg: Expr, *formatters: Expr) -> Expr: def to_timestamp_millis(arg: Expr, *formatters: Expr) -> Expr: - """Converts a string and optional formats to a `Timestamp` in milliseconds. + """Converts a string and optional formats to a ``Timestamp`` in milliseconds. - See `to_timestamp` for a description on how to use formatters. 
+ See :py:func:`to_timestamp` for a description on how to use formatters. """ return Expr(f.to_timestamp_millis(arg.expr, *formatters)) def to_timestamp_micros(arg: Expr, *formatters: Expr) -> Expr: - """Converts a string and optional formats to a `Timestamp` in microseconds. + """Converts a string and optional formats to a ``Timestamp`` in microseconds. - See `to_timestamp` for a description on how to use formatters. + See :py:func:`to_timestamp` for a description on how to use formatters. """ return Expr(f.to_timestamp_micros(arg.expr, *formatters)) def to_timestamp_nanos(arg: Expr, *formatters: Expr) -> Expr: - """Converts a string and optional formats to a `Timestamp` in nanoseconds. + """Converts a string and optional formats to a ``Timestamp`` in nanoseconds. - See `to_timestamp` for a description on how to use formatters. + See :py:func:`to_timestamp` for a description on how to use formatters. """ return Expr(f.to_timestamp_nanos(arg.expr, *formatters)) def to_timestamp_seconds(arg: Expr, *formatters: Expr) -> Expr: - """Converts a string and optional formats to a `Timestamp` in seconds. + """Converts a string and optional formats to a ``Timestamp`` in seconds. - See `to_timestamp` for a description on how to use formatters. + See :py:func:`to_timestamp` for a description on how to use formatters. """ return Expr(f.to_timestamp_seconds(arg.expr, *formatters)) @@ -920,7 +926,7 @@ def current_time() -> Expr: def datepart(part: Expr, date: Expr) -> Expr: """Return a specified part of a date. - This is an alias for `date_part`. + This is an alias for :py:func:`date_part`. """ return date_part(part, date) @@ -938,7 +944,7 @@ def date_trunc(part: Expr, date: Expr) -> Expr: def datetrunc(part: Expr, date: Expr) -> Expr: """Truncates the date to a specified level of precision. - This is an alias for `date_trunc`. + This is an alias for :py:func:`date_trunc`. 
""" return date_trunc(part, date) @@ -954,7 +960,7 @@ def make_date(year: Expr, month: Expr, day: Expr) -> Expr: def translate(string: Expr, from_val: Expr, to_val: Expr) -> Expr: - """Replaces the characters in `from_val` with the counterpart in `to_val`.""" + """Replaces the characters in ``from_val`` with the counterpart in ``to_val``.""" return Expr(f.translate(string.expr, from_val.expr, to_val.expr)) @@ -984,7 +990,7 @@ def make_array(*args: Expr) -> Expr: def array(*args: Expr) -> Expr: """Returns an array using the specified input expressions. - This is an alias for `make_array`. + This is an alias for :py:func:`make_array`. """ return make_array(args) @@ -1025,7 +1031,7 @@ def arrow_typeof(arg: Expr) -> Expr: def random() -> Expr: - """Returns a random value in the range `0.0 <= x < 1.0`.""" + """Returns a random value in the range ``0.0 <= x < 1.0``.""" return Expr(f.random()) @@ -1037,7 +1043,7 @@ def array_append(array: Expr, element: Expr) -> Expr: def array_push_back(array: Expr, element: Expr) -> Expr: """Appends an element to the end of an array. - This is an alias for `array_append`. + This is an alias for :py:func:`array_append`. """ return array_append(array, element) @@ -1045,7 +1051,7 @@ def array_push_back(array: Expr, element: Expr) -> Expr: def list_append(array: Expr, element: Expr) -> Expr: """Appends an element to the end of an array. - This is an alias for `array_append`. + This is an alias for :py:func:`array_append`. """ return array_append(array, element) @@ -1053,7 +1059,7 @@ def list_append(array: Expr, element: Expr) -> Expr: def list_push_back(array: Expr, element: Expr) -> Expr: """Appends an element to the end of an array. - This is an alias for `array_append`. + This is an alias for :py:func:`array_append`. """ return array_append(array, element) @@ -1067,7 +1073,7 @@ def array_concat(*args: Expr) -> Expr: def array_cat(*args: Expr) -> Expr: """Concatenates the input arrays. - This is an alias for `array_concat`. 
+ This is an alias for :py:func:`array_concat`. """ return array_concat(*args) @@ -1085,7 +1091,7 @@ def array_distinct(array: Expr) -> Expr: def list_distinct(array: Expr) -> Expr: """Returns distinct values from the array after removing duplicates. - This is an alias for `array_distinct`. + This is an alias for :py:func:`array_distinct`. """ return array_distinct(array) @@ -1093,7 +1099,7 @@ def list_distinct(array: Expr) -> Expr: def list_dims(array: Expr) -> Expr: """Returns an array of the array's dimensions. - This is an alias for `array_dims`. + This is an alias for :py:func:`array_dims`. """ return array_dims(array) @@ -1106,7 +1112,7 @@ def array_element(array: Expr, n: Expr) -> Expr: def array_extract(array: Expr, n: Expr) -> Expr: """Extracts the element with the index n from the array. - This is an alias for `array_element`. + This is an alias for :py:func:`array_element`. """ return array_element(array, n) @@ -1114,7 +1120,7 @@ def array_extract(array: Expr, n: Expr) -> Expr: def list_element(array: Expr, n: Expr) -> Expr: """Extracts the element with the index n from the array. - This is an alias for `array_element`. + This is an alias for :py:func:`array_element`. """ return array_element(array, n) @@ -1122,7 +1128,7 @@ def list_element(array: Expr, n: Expr) -> Expr: def list_extract(array: Expr, n: Expr) -> Expr: """Extracts the element with the index n from the array. - This is an alias for `array_element`. + This is an alias for :py:func:`array_element`. """ return array_element(array, n) @@ -1135,7 +1141,7 @@ def array_length(array: Expr) -> Expr: def list_length(array: Expr) -> Expr: """Returns the length of the array. - This is an alias for `array_length`. + This is an alias for :py:func:`array_length`. 
""" return array_length(array) @@ -1171,7 +1177,7 @@ def array_position(array: Expr, element: Expr, index: int | None = 1) -> Expr: def array_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr: """Return the position of the first occurrence of ``element`` in ``array``. - This is an alias for `array_position`. + This is an alias for :py:func:`array_position`. """ return array_position(array, element, index) @@ -1179,7 +1185,7 @@ def array_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr: def list_position(array: Expr, element: Expr, index: int | None = 1) -> Expr: """Return the position of the first occurrence of ``element`` in ``array``. - This is an alias for `array_position`. + This is an alias for :py:func:`array_position`. """ return array_position(array, element, index) @@ -1187,7 +1193,7 @@ def list_position(array: Expr, element: Expr, index: int | None = 1) -> Expr: def list_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr: """Return the position of the first occurrence of ``element`` in ``array``. - This is an alias for `array_position`. + This is an alias for :py:func:`array_position`. """ return array_position(array, element, index) @@ -1200,7 +1206,7 @@ def array_positions(array: Expr, element: Expr) -> Expr: def list_positions(array: Expr, element: Expr) -> Expr: """Searches for an element in the array and returns all occurrences. - This is an alias for `array_positions`. + This is an alias for :py:func:`array_positions`. """ return array_positions(array, element) @@ -1213,7 +1219,7 @@ def array_ndims(array: Expr) -> Expr: def list_ndims(array: Expr) -> Expr: """Returns the number of dimensions of the array. - This is an alias for `array_ndims`. + This is an alias for :py:func:`array_ndims`. 
""" return array_ndims(array) @@ -1226,7 +1232,7 @@ def array_prepend(element: Expr, array: Expr) -> Expr: def array_push_front(element: Expr, array: Expr) -> Expr: """Prepends an element to the beginning of an array. - This is an alias for `array_prepend`. + This is an alias for :py:func:`array_prepend`. """ return array_prepend(element, array) @@ -1234,7 +1240,7 @@ def array_push_front(element: Expr, array: Expr) -> Expr: def list_prepend(element: Expr, array: Expr) -> Expr: """Prepends an element to the beginning of an array. - This is an alias for `array_prepend`. + This is an alias for :py:func:`array_prepend`. """ return array_prepend(element, array) @@ -1242,7 +1248,7 @@ def list_prepend(element: Expr, array: Expr) -> Expr: def list_push_front(element: Expr, array: Expr) -> Expr: """Prepends an element to the beginning of an array. - This is an alias for `array_prepend`. + This is an alias for :py:func:`array_prepend`. """ return array_prepend(element, array) @@ -1265,20 +1271,20 @@ def array_remove(array: Expr, element: Expr) -> Expr: def list_remove(array: Expr, element: Expr) -> Expr: """Removes the first element from the array equal to the given value. - This is an alias for `array_remove`. + This is an alias for :py:func:`array_remove`. """ return array_remove(array, element) def array_remove_n(array: Expr, element: Expr, max: Expr) -> Expr: - """Removes the first `max` elements from the array equal to the given value.""" + """Removes the first ``max`` elements from the array equal to the given value.""" return Expr(f.array_remove_n(array.expr, element.expr, max.expr)) def list_remove_n(array: Expr, element: Expr, max: Expr) -> Expr: - """Removes the first `max` elements from the array equal to the given value. + """Removes the first ``max`` elements from the array equal to the given value. - This is an alias for `array_remove_n`. + This is an alias for :py:func:`array_remove_n`. 
""" return array_remove_n(array, element, max) @@ -1291,13 +1297,13 @@ def array_remove_all(array: Expr, element: Expr) -> Expr: def list_remove_all(array: Expr, element: Expr) -> Expr: """Removes all elements from the array equal to the given value. - This is an alias for `array_remove_all`. + This is an alias for :py:func:`array_remove_all`. """ return array_remove_all(array, element) def array_repeat(element: Expr, count: Expr) -> Expr: - """Returns an array containing `element` `count` times.""" + """Returns an array containing ``element`` ``count`` times.""" return Expr(f.array_repeat(element.expr, count.expr)) @@ -1309,27 +1315,27 @@ def array_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr: def list_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr: """Replaces the first occurrence of ``from_val`` with ``to_val``. - This is an alias for `array_replace`. + This is an alias for :py:func:`array_replace`. """ return array_replace(array, from_val, to_val) def array_replace_n(array: Expr, from_val: Expr, to_val: Expr, max: Expr) -> Expr: - """Replace `n` occurrences of ``from_val`` with ``to_val``. + """Replace ``n`` occurrences of ``from_val`` with ``to_val``. - Replaces the first `max` occurrences of the specified element with another + Replaces the first ``max`` occurrences of the specified element with another specified element. """ return Expr(f.array_replace_n(array.expr, from_val.expr, to_val.expr, max.expr)) def list_replace_n(array: Expr, from_val: Expr, to_val: Expr, max: Expr) -> Expr: - """Replace `n` occurrences of ``from_val`` with ``to_val``. + """Replace ``n`` occurrences of ``from_val`` with ``to_val``. - Replaces the first `max` occurrences of the specified element with another + Replaces the first ``max`` occurrences of the specified element with another specified element. - This is an alias for `array_replace_n`. + This is an alias for :py:func:`array_replace_n`. 
""" return array_replace_n(array, from_val, to_val, max) @@ -1342,7 +1348,7 @@ def array_replace_all(array: Expr, from_val: Expr, to_val: Expr) -> Expr: def list_replace_all(array: Expr, from_val: Expr, to_val: Expr) -> Expr: """Replaces all occurrences of ``from_val`` with ``to_val``. - This is an alias for `array_replace_all`. + This is an alias for :py:func:`array_replace_all`. """ return array_replace_all(array, from_val, to_val) @@ -1365,7 +1371,7 @@ def array_sort(array: Expr, descending: bool = False, null_first: bool = False) def list_sort(array: Expr, descending: bool = False, null_first: bool = False) -> Expr: - """This is an alias for ``array_sort``.""" + """This is an alias for :py:func:`array_sort`.""" return array_sort(array, descending=descending, null_first=null_first) @@ -1381,20 +1387,20 @@ def array_slice( def list_slice(array: Expr, begin: Expr, end: Expr, stride: Expr | None = None) -> Expr: """Returns a slice of the array. - This is an alias for `array_slice`. + This is an alias for :py:func:`array_slice`. """ return array_slice(array, begin, end, stride) def array_intersect(array1: Expr, array2: Expr) -> Expr: - """Returns an array of the elements in the intersection of array1 and array2.""" + """Returns the intersection of ``array1`` and ``array2``.""" return Expr(f.array_intersect(array1.expr, array2.expr)) def list_intersect(array1: Expr, array2: Expr) -> Expr: - """Returns an array of the elements in the intersection of `array1` and `array2`. + """Returns an the intersection of ``array1`` and ``array2``. - This is an alias for `array_intersect`. + This is an alias for :py:func:`array_intersect`. """ return array_intersect(array1, array2) @@ -1412,20 +1418,20 @@ def list_union(array1: Expr, array2: Expr) -> Expr: Duplicate rows will not be returned. - This is an alias for `array_union`. + This is an alias for :py:func:`array_union`. 
""" return array_union(array1, array2) def array_except(array1: Expr, array2: Expr) -> Expr: - """Returns an array of the elements that appear in `array1` but not in `array2`.""" + """Returns the elements that appear in ``array1`` but not in ``array2``.""" return Expr(f.array_except(array1.expr, array2.expr)) def list_except(array1: Expr, array2: Expr) -> Expr: - """Returns an array of the elements that appear in `array1` but not in the `array2`. + """Returns the elements that appear in ``array1`` but not in the ``array2``. - This is an alias for `array_except`. + This is an alias for :py:func:`array_except`. """ return array_except(array1, array2) @@ -1433,8 +1439,8 @@ def list_except(array1: Expr, array2: Expr) -> Expr: def array_resize(array: Expr, size: Expr, value: Expr) -> Expr: """Returns an array with the specified size filled. - If `size` is greater than the `array` length, the additional entries will be filled - with the given `value`. + If ``size`` is greater than the ``array`` length, the additional entries will + be filled with the given ``value``. """ return Expr(f.array_resize(array.expr, size.expr, value.expr)) @@ -1442,8 +1448,8 @@ def array_resize(array: Expr, size: Expr, value: Expr) -> Expr: def list_resize(array: Expr, size: Expr, value: Expr) -> Expr: """Returns an array with the specified size filled. - If `size` is greater than the `array` length, the additional entries will be - filled with the given `value`. This is an alias for `array_resize`. + If ``size`` is greater than the ``array`` length, the additional entries will be + filled with the given ``value``. This is an alias for :py:func:`array_resize`. """ return array_resize(array, size, value) @@ -1489,7 +1495,7 @@ def approx_percentile_cont_with_weight( ) -> Expr: """Returns the value of the approximate percentile. 
- This function is similar to ``approx_percentile_cont`` except that it uses + This function is similar to :py:func:`approx_percentile_cont` except that it uses the associated associated weights. """ return Expr( @@ -1510,7 +1516,7 @@ def avg(arg: Expr, distinct: bool = False) -> Expr: def corr(value1: Expr, value2: Expr, distinct: bool = False) -> Expr: - """Returns the correlation coefficient between `value1` and `value2`.""" + """Returns the correlation coefficient between ``value1`` and ``value2``.""" return Expr(f.corr(value1.expr, value2.expr, distinct=distinct)) @@ -1528,7 +1534,7 @@ def count(args: Expr | list[Expr] | None = None, distinct: bool = False) -> Expr def covar(y: Expr, x: Expr) -> Expr: """Computes the sample covariance. - This is an alias for `covar_samp`. + This is an alias for :py:func:`covar_samp`. """ return covar_samp(y, x) @@ -1559,7 +1565,7 @@ def max(arg: Expr, distinct: bool = False) -> Expr: def mean(arg: Expr, distinct: bool = False) -> Expr: """Returns the average (mean) value of the argument. - This is an alias for `avg`. + This is an alias for :py:func:`avg`. """ return avg(arg, distinct) @@ -1592,7 +1598,7 @@ def stddev_pop(arg: Expr, distinct: bool = False) -> Expr: def stddev_samp(arg: Expr, distinct: bool = False) -> Expr: """Computes the sample standard deviation of the argument. - This is an alias for `stddev`. + This is an alias for :py:func:`stddev`. """ return stddev(arg, distinct) @@ -1600,7 +1606,7 @@ def stddev_samp(arg: Expr, distinct: bool = False) -> Expr: def var(arg: Expr) -> Expr: """Computes the sample variance of the argument. - This is an alias for `var_samp`. + This is an alias for :py:func:`var_samp`. """ return var_samp(arg) @@ -1616,7 +1622,7 @@ def var_samp(arg: Expr) -> Expr: def regr_avgx(y: Expr, x: Expr, distinct: bool = False) -> Expr: - """Computes the average of the independent variable `x`. + """Computes the average of the independent variable ``x``. 
Only non-null pairs of the inputs are evaluated. """ @@ -1652,7 +1658,7 @@ def regr_slope(y: Expr, x: Expr, distinct: bool = False) -> Expr: def regr_sxx(y: Expr, x: Expr, distinct: bool = False) -> Expr: - """Computes the sum of squares of the independent variable `x`.""" + """Computes the sum of squares of the independent variable ``x``.""" return Expr(f.regr_sxx(y.expr, x.expr, distinct)) @@ -1662,7 +1668,7 @@ def regr_sxy(y: Expr, x: Expr, distinct: bool = False) -> Expr: def regr_syy(y: Expr, x: Expr, distinct: bool = False) -> Expr: - """Computes the sum of squares of the dependent variable `y`.""" + """Computes the sum of squares of the dependent variable ``y``.""" return Expr(f.regr_syy(y.expr, x.expr, distinct)) diff --git a/python/datafusion/record_batch.py b/python/datafusion/record_batch.py index dcfd55485..44936f7d8 100644 --- a/python/datafusion/record_batch.py +++ b/python/datafusion/record_batch.py @@ -17,7 +17,8 @@ """This module provides the classes for handling record batches. -These are typically the result of dataframe `execute_stream` operations. +These are typically the result of dataframe +:py:func:`datafusion.dataframe.execute_stream` operations. """ from __future__ import annotations @@ -31,24 +32,25 @@ class RecordBatch: - """This class is essentially a wrapper for ``pyarrow.RecordBatch``.""" + """This class is essentially a wrapper for :py:class:`pyarrow.RecordBatch`.""" def __init__(self, record_batch: df_internal.RecordBatch) -> None: """This constructor is generally not called by the end user. - See the ``RecordBatchStream`` iterator for generating this class. + See the :py:class:`RecordBatchStream` iterator for generating this class. """ self.record_batch = record_batch def to_pyarrow(self) -> pyarrow.RecordBatch: - """Convert to pyarrow ``RecordBatch``.""" + """Convert to :py:class:`pyarrow.RecordBatch`.""" return self.record_batch.to_pyarrow() class RecordBatchStream: """This class represents a stream of record batches. 
- These are typically the result of a ``DataFrame::execute_stream`` operation. + These are typically the result of a + :py:func:`~datafusion.dataframe.DataFrame.execute_stream` operation. """ def __init__(self, record_batch_stream: df_internal.RecordBatchStream) -> None: @@ -56,7 +58,7 @@ def __init__(self, record_batch_stream: df_internal.RecordBatchStream) -> None: self.rbs = record_batch_stream def next(self) -> RecordBatch | None: - """See ``__next__`` for the iterator function.""" + """See :py:func:`__next__` for the iterator function.""" try: next_batch = next(self) except StopIteration: diff --git a/python/datafusion/substrait.py b/python/datafusion/substrait.py index 4b44ad19b..0cdd19a51 100644 --- a/python/datafusion/substrait.py +++ b/python/datafusion/substrait.py @@ -48,7 +48,8 @@ def __init__(self, plan: substrait_internal.Plan) -> None: """Create a substrait plan. The user should not have to call this constructor directly. Rather, it - should be created via `Serde` or `Producer` classes in this module. + should be created via :py:class:`Serde` or py:class:`Producer` classes + in this module. """ self.plan_internal = plan diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index 4bfbabe69..12563b3d2 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -35,7 +35,7 @@ class Volatility(Enum): """Defines how stable or volatile a function is. When setting the volatility of a function, you can either pass this - enumeration or a `str`. The `str` equivalent is the lower case value of the + enumeration or a ``str``. The ``str`` equivalent is the lower case value of the name (`"immutable"`, `"stable"`, or `"volatile"`). """ @@ -52,9 +52,9 @@ class Volatility(Enum): A stable function may return different values given the same input across different queries but must return the same value for a given input within a - query. An example of this is the `Now` function. 
DataFusion will attempt to - inline `Stable` functions during planning, when possible. For query - `select col1, now() from t1`, it might take a while to execute but `now()` + query. An example of this is the ``Now`` function. DataFusion will attempt to + inline ``Stable`` functions during planning, when possible. For query + ``select col1, now() from t1``, it might take a while to execute but ``now()`` column will be the same for each output row, which is evaluated during planning. """ @@ -66,7 +66,7 @@ class Volatility(Enum): Multiple invocations of a volatile function may return different results when used in the same query. An example of this is the random() function. DataFusion can not evaluate such functions during planning. In the query - `select col1, random() from t1`, `random()` function will be evaluated + ``select col1, random() from t1``, ``random()`` function will be evaluated for each output row, resulting in a unique random value for each row. """ @@ -78,7 +78,7 @@ def __str__(self): class ScalarUDF: """Class for performing scalar user defined functions (UDF). - Scalar UDFs operate on a row by row basis. See also ``AggregateUDF`` for + Scalar UDFs operate on a row by row basis. See also :py:class:`AggregateUDF` for operating on a group of rows. """ @@ -92,9 +92,9 @@ def __init__( ) -> None: """Instantiate a scalar user defined function (UDF). - See helper method ``udf`` for argument details. + See helper method :py:func:`udf` for argument details. """ - self.udf = df_internal.ScalarUDF( + self._udf = df_internal.ScalarUDF( name, func, input_types, return_type, str(volatility) ) @@ -105,7 +105,7 @@ def __call__(self, *args: Expr) -> Expr: occur during the evaluation of the dataframe. """ args = [arg.expr for arg in args] - return Expr(self.udf.__call__(*args)) + return Expr(self._udf.__call__(*args)) @staticmethod def udf( @@ -119,7 +119,7 @@ def udf( Args: func: A callable python function. - input_types: The data types of the arguments to `func`. 
This list + input_types: The data types of the arguments to ``func``. This list must be of the same length as the number of arguments. return_type: The data type of the return value from the python function. @@ -144,7 +144,7 @@ def udf( class Accumulator(metaclass=ABCMeta): - """Defines how an `AggregateUDF` accumulates values during an evaluation.""" + """Defines how an :py:class:`AggregateUDF` accumulates values.""" @abstractmethod def state(self) -> List[pyarrow.Scalar]: @@ -175,7 +175,7 @@ class AggregateUDF: """Class for performing scalar user defined functions (UDF). Aggregate UDFs operate on a group of rows and return a single value. See - also ``ScalarUDF`` for operating on a row by row basis. + also :py:class:`ScalarUDF` for operating on a row by row basis. """ def __init__( @@ -189,10 +189,10 @@ def __init__( ) -> None: """Instantiate a user defined aggregate function (UDAF). - See ``Aggregate::udaf`` for a convenience function and arugment + See :py:func:`udaf` for a convenience function and arugment descriptions. """ - self.udf = df_internal.AggregateUDF( + self._udf = df_internal.AggregateUDF( name, accumulator, input_types, return_type, state_type, str(volatility) ) @@ -203,7 +203,7 @@ def __call__(self, *args: Expr) -> Expr: occur during the evaluation of the dataframe. """ args = [arg.expr for arg in args] - return Expr(self.udf.__call__(*args)) + return Expr(self._udf.__call__(*args)) @staticmethod def udaf( @@ -216,14 +216,14 @@ def udaf( ) -> AggregateUDF: """Create a new User Defined Aggregate Function. - The accumulator function must be callable and implement `Accumulator`. + The accumulator function must be callable and implement :py:class:`Accumulator`. Args: accum: The accumulator python function. - input_types: The data types of the arguments to `accum`. + input_types: The data types of the arguments to ``accum``. return_type: The data type of the return value. state_type: The data types of the intermediate accumulation. 
- volatility: See `Volatility` for allowed values. + volatility: See :py:class:`Volatility` for allowed values. name: A descriptive name for the function. Returns: From 2205e058f01f4f577846833465258bfab12b8d15 Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Fri, 9 Aug 2024 08:22:45 -0500 Subject: [PATCH 011/248] Re-Enable `num_centroids` to `approx_percentile_cont` (#798) --- Cargo.lock | 164 +++++++++++--------- Cargo.toml | 10 +- python/datafusion/functions.py | 7 +- python/datafusion/tests/test_aggregation.py | 133 ++++++++-------- src/functions.rs | 10 +- 5 files changed, 172 insertions(+), 152 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c41ef771a..e59811210 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -130,9 +130,9 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arrow" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6127ea5e585a12ec9f742232442828ebaf264dfa5eefdd71282376c599562b77" +checksum = "05048a8932648b63f21c37d88b552ccc8a65afb6dfe9fc9f30ce79174c2e7a85" dependencies = [ "arrow-arith", "arrow-array", @@ -152,9 +152,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7add7f39210b7d726e2a8efc0083e7bf06e8f2d15bdb4896b564dce4410fbf5d" +checksum = "1d8a57966e43bfe9a3277984a14c24ec617ad874e4c0e1d2a1b083a39cfbf22c" dependencies = [ "arrow-array", "arrow-buffer", @@ -167,9 +167,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81c16ec702d3898c2f5cfdc148443c6cd7dbe5bac28399859eb0a3d38f072827" +checksum = "16f4a9468c882dc66862cef4e1fd8423d47e67972377d85d80e022786427768c" dependencies = [ "ahash", "arrow-buffer", @@ -184,9 +184,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version 
= "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cae6970bab043c4fbc10aee1660ceb5b306d0c42c8cc5f6ae564efcd9759b663" +checksum = "c975484888fc95ec4a632cdc98be39c085b1bb518531b0c80c5d462063e5daa1" dependencies = [ "bytes", "half", @@ -195,9 +195,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c7ef44f26ef4f8edc392a048324ed5d757ad09135eff6d5509e6450d39e0398" +checksum = "da26719e76b81d8bc3faad1d4dbdc1bcc10d14704e63dc17fc9f3e7e1e567c8e" dependencies = [ "arrow-array", "arrow-buffer", @@ -216,9 +216,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f843490bd258c5182b66e888161bb6f198f49f3792f7c7f98198b924ae0f564" +checksum = "c13c36dc5ddf8c128df19bab27898eea64bf9da2b555ec1cd17a8ff57fba9ec2" dependencies = [ "arrow-array", "arrow-buffer", @@ -235,9 +235,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a769666ffac256dd301006faca1ca553d0ae7cffcf4cd07095f73f95eb226514" +checksum = "dd9d6f18c65ef7a2573ab498c374d8ae364b4a4edf67105357491c031f716ca5" dependencies = [ "arrow-buffer", "arrow-schema", @@ -247,9 +247,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf9c3fb57390a1af0b7bb3b5558c1ee1f63905f3eccf49ae7676a8d1e6e5a72" +checksum = "e786e1cdd952205d9a8afc69397b317cfbb6e0095e445c69cda7e8da5c1eeb0f" dependencies = [ "arrow-array", "arrow-buffer", @@ -262,9 +262,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"654e7f3724176b66ddfacba31af397c48e106fbe4d281c8144e7d237df5acfd7" +checksum = "fb22284c5a2a01d73cebfd88a33511a3234ab45d66086b2ca2d1228c3498e445" dependencies = [ "arrow-array", "arrow-buffer", @@ -282,9 +282,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8008370e624e8e3c68174faaf793540287106cfda8ad1da862fdc53d8e096b4" +checksum = "42745f86b1ab99ef96d1c0bcf49180848a64fe2c7a7a0d945bc64fa2b21ba9bc" dependencies = [ "arrow-array", "arrow-buffer", @@ -297,9 +297,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca5e3a6b7fda8d9fe03f3b18a2d946354ea7f3c8e4076dbdb502ad50d9d44824" +checksum = "4cd09a518c602a55bd406bcc291a967b284cfa7a63edfbf8b897ea4748aad23c" dependencies = [ "ahash", "arrow-array", @@ -307,23 +307,22 @@ dependencies = [ "arrow-data", "arrow-schema", "half", - "hashbrown", ] [[package]] name = "arrow-schema" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dab1c12b40e29d9f3b699e0203c2a73ba558444c05e388a4377208f8f9c97eee" +checksum = "9e972cd1ff4a4ccd22f86d3e53e835c2ed92e0eea6a3e8eadb72b4f1ac802cf8" dependencies = [ "bitflags 2.6.0", ] [[package]] name = "arrow-select" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e80159088ffe8c48965cb9b1a7c968b2729f29f37363df7eca177fc3281fe7c3" +checksum = "600bae05d43483d216fb3494f8c32fdbefd8aa4e1de237e790dbb3d9f44690a3" dependencies = [ "ahash", "arrow-array", @@ -335,9 +334,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fd04a6ea7de183648edbcb7a6dd925bbd04c210895f6384c780e27a9b54afcd" +checksum = 
"f0dc1985b67cb45f6606a248ac2b4a288849f196bab8c657ea5589f47cdd55e6" dependencies = [ "arrow-array", "arrow-buffer", @@ -365,7 +364,7 @@ dependencies = [ "tokio", "xz2", "zstd 0.13.2", - "zstd-safe 7.2.0", + "zstd-safe 7.2.1", ] [[package]] @@ -516,9 +515,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.6.1" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a12916984aab3fa6e39d655a33e09c0071eb36d6ab3aea5c2d78551f1df6d952" +checksum = "8318a53db07bb3f8dca91a600466bdb3f2eaadeedfdbcf02e1accbad9271ba50" [[package]] name = "bzip2" @@ -543,9 +542,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.6" +version = "1.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aba8f4e9906c7ce3c73463f62a7f0c65183ada1a2d47e397cc8810827f9694f" +checksum = "26a5c3fd7bfa1ce3897a3a3501d362b2d87b7f2583ebcb4a949ec25911025cbc" dependencies = [ "jobserver", "libc", @@ -1163,9 +1162,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.30" +version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" +checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920" dependencies = [ "crc32fast", "miniz_oxide", @@ -1458,9 +1457,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ab92f4f49ee4fb4f997c784b7a2e0fa70050211e0b6a287f898c3c9785ca956" +checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9" dependencies = [ "bytes", "futures-channel", @@ -1511,9 +1510,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.6" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +checksum = "de3fc2e30ba82dd1b3911c8de1ffc143c74a914a14e99514d7637e3099df5ea0" dependencies = [ "equivalent", "hashbrown", @@ -1995,9 +1994,9 @@ dependencies = [ [[package]] name = "parquet" -version = "52.1.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f22ba0d95db56dde8685e3fadcb915cdaadda31ab8abbe3ff7f0ad1ef333267" +checksum = "e977b9066b4d3b03555c22bdc442f3fadebd96a39111249113087d0edb2691cd" dependencies = [ "ahash", "arrow-array", @@ -2181,9 +2180,12 @@ checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265" [[package]] name = "ppv-lite86" -version = "0.2.17" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] [[package]] name = "prettyplease" @@ -2347,9 +2349,9 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4ceeeeabace7857413798eb1ffa1e9c905a9946a57d81fb69b4b71c4d8eb3ad" +checksum = "b22d8e7369034b9a7132bc2008cac12f2013c8132b45e0554e6e20e2617f2156" dependencies = [ "bytes", "pin-project-lite", @@ -2357,6 +2359,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", + "socket2", "thiserror", "tokio", "tracing", @@ -2364,9 +2367,9 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.3" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddf517c03a109db8100448a4be38d498df8a210a99fe0e1b9eaf39e78c640efe" +checksum = "ba92fb39ec7ad06ca2582c0ca834dfeadcaf06ddfc8e635c80aa7e1c05315fdd" dependencies = [ "bytes", "rand", @@ -2388,6 +2391,7 @@ dependencies = [ "libc", "once_cell", "socket2", + "tracing", "windows-sys 0.52.0", 
] @@ -2441,9 +2445,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.5" +version = "1.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" +checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" dependencies = [ "aho-corasick", "memchr", @@ -2558,9 +2562,9 @@ checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustc-hash" -version = "1.1.0" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" [[package]] name = "rustc_version" @@ -2613,9 +2617,9 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "2.1.2" +version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29993a25686778eb88d4189742cd713c9bce943bc54251a33509dc63cbacf73d" +checksum = "196fe16b00e106300d3e45ecfcb764fa292a535d7326a29a5875c579c7417425" dependencies = [ "base64 0.22.1", "rustls-pki-types", @@ -2623,9 +2627,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.7.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "976295e77ce332211c0d24d92c0e83e50f5c5f046d11082cea19f3df13a3562d" +checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0" [[package]] name = "rustls-webpki" @@ -2769,11 +2773,12 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.120" +version = "1.0.122" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5" +checksum = "784b6203951c57ff748476b126ccb5e8e2959a5c19e5c617ab1956be3dbc68da" dependencies = [ "itoa", + "memchr", "ryu", "serde", ] @@ -3020,18 +3025,19 @@ checksum = 
"a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" [[package]] name = "target-lexicon" -version = "0.12.15" +version = "0.12.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4873307b7c257eddcb50c9bedf158eb669578359fb28428bef438fec8e6ba7c2" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.10.1" +version = "3.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +checksum = "b8fcd239983515c23a32fb82099f97d0b11b8c72f654ed659363a95c3dad7a53" dependencies = [ "cfg-if", "fastrand", + "once_cell", "rustix", "windows-sys 0.52.0", ] @@ -3093,9 +3099,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.39.1" +version = "1.39.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d040ac2b29ab03b09d4129c2f5bbd012a3ac2f79d38ff506a4bf8dd34b0eac8a" +checksum = "daa4fb1bc778bd6f04cbfc4bb2d06a7396a8f299dc33ea1900cedaa316f467b1" dependencies = [ "backtrace", "bytes", @@ -3363,9 +3369,9 @@ dependencies = [ [[package]] name = "version_check" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "walkdir" @@ -3483,11 +3489,11 @@ dependencies = [ [[package]] name = "winapi-util" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -3517,6 +3523,15 @@ dependencies = [ 
"windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-targets" version = "0.48.5" @@ -3663,6 +3678,7 @@ version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ + "byteorder", "zerocopy-derive", ] @@ -3698,7 +3714,7 @@ version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" dependencies = [ - "zstd-safe 7.2.0", + "zstd-safe 7.2.1", ] [[package]] @@ -3713,18 +3729,18 @@ dependencies = [ [[package]] name = "zstd-safe" -version = "7.2.0" +version = "7.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa556e971e7b568dc775c136fc9de8c779b1c2fc3a63defaafadffdbd3181afa" +checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" dependencies = [ "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.11+zstd.1.5.6" +version = "2.0.12+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75652c55c0b6f3e6f12eb786fe1bc960396bf05a1eb3bf1f3691c3610ac2e6d4" +checksum = "0a4e40c320c3cb459d9a9ff6de98cff88f4751ee9275d140e2be94a2b74e4c13" dependencies = [ "cc", "pkg-config", diff --git a/Cargo.toml b/Cargo.toml index d05a617a3..820118fa8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,7 +34,7 @@ protoc = [ "datafusion-substrait/protoc" ] substrait = ["dep:datafusion-substrait"] [dependencies] -tokio = { version = "1.35", features = ["macros", "rt", "rt-multi-thread", "sync"] } +tokio = { version = "1.39", features = ["macros", "rt", "rt-multi-thread", "sync"] } rand = "0.8" pyo3 = { version = "0.21", features = 
["extension-module", "abi3", "abi3-py38"] } arrow = { version = "52", feature = ["pyarrow"] } @@ -45,17 +45,17 @@ datafusion-functions-array = "40.0.0" datafusion-optimizer = "40.0.0" datafusion-sql = "40.0.0" datafusion-substrait = { version = "40.0.0", optional = true } -prost = "0.12" -prost-types = "0.12" +prost = "0.12" # keep in line with `datafusion-substrait` +prost-types = "0.12" # keep in line with `datafusion-substrait` uuid = { version = "1.9", features = ["v4"] } mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] } async-trait = "0.1" futures = "0.3" object_store = { version = "0.10.1", features = ["aws", "gcp", "azure"] } parking_lot = "0.12" -regex-syntax = "0.8.1" +regex-syntax = "0.8" syn = "2.0.68" -url = "2.2" +url = "2" sqlparser = "0.47.0" [build-dependencies] diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 82b5056d7..2d3d87ee0 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1473,19 +1473,18 @@ def approx_median(arg: Expr, distinct: bool = False) -> Expr: def approx_percentile_cont( expression: Expr, percentile: Expr, + num_centroids: Expr | None = None, distinct: bool = False, ) -> Expr: """Returns the value that is approximately at a given percentile of ``expr``.""" - # Re-enable num_centroids: https://github.com/apache/datafusion-python/issues/777 - num_centroids = None if num_centroids is None: return Expr( - f.approx_percentile_cont(expression.expr, percentile.expr, distinct=distinct) + f.approx_percentile_cont(expression.expr, percentile.expr, distinct=distinct, num_centroids=None) ) return Expr( f.approx_percentile_cont( - expression.expr, percentile.expr, distinct=distinct + expression.expr, percentile.expr, distinct=distinct, num_centroids=num_centroids.expr ) ) diff --git a/python/datafusion/tests/test_aggregation.py b/python/datafusion/tests/test_aggregation.py index c10e5f36c..03485da4b 100644 --- 
a/python/datafusion/tests/test_aggregation.py +++ b/python/datafusion/tests/test_aggregation.py @@ -39,79 +39,78 @@ def df(): ) return ctx.create_dataframe([[batch]]) +@pytest.fixture +def df_aggregate_100(): + ctx = SessionContext() + ctx.register_csv("aggregate_test_data", "./testing/data/csv/aggregate_test_100.csv") + return ctx.table("aggregate_test_data") -def test_built_in_aggregation(df): - col_a = column("a") - col_b = column("b") - col_c = column("c") - agg_df = df.aggregate( - [], - [ - f.approx_distinct(col_b), - f.approx_median(col_b), - f.approx_percentile_cont(col_b, lit(0.5)), - f.approx_percentile_cont_with_weight(col_b, lit(0.6), lit(0.5)), - f.array_agg(col_b), - f.avg(col_a), - f.corr(col_a, col_b), - f.count(col_a), - f.covar(col_a, col_b), - f.covar_pop(col_a, col_c), - f.covar_samp(col_b, col_c), - # f.grouping(col_a), # No physical plan implemented yet - f.max(col_a), - f.mean(col_b), - f.median(col_b), - f.min(col_a), - f.sum(col_b), - f.stddev(col_a), - f.stddev_pop(col_b), - f.stddev_samp(col_c), - f.var(col_a), - f.var_pop(col_b), - f.var_samp(col_c), - ], - ) +@pytest.mark.parametrize("agg_expr, calc_expected", [ + (f.avg(column("a")), lambda a, b, c, d: np.array(np.average(a))), + (f.corr(column("a"), column("b")), lambda a, b, c, d: np.array(np.corrcoef(a, b)[0][1])), + (f.count(column("a")), lambda a, b, c, d: pa.array([len(a)])), + # Sample (co)variance -> ddof=1 + # Population (co)variance -> ddof=0 + (f.covar(column("a"), column("b")), lambda a, b, c, d: np.array(np.cov(a, b, ddof=1)[0][1])), + (f.covar_pop(column("a"), column("c")), lambda a, b, c, d: np.array(np.cov(a, c, ddof=0)[0][1])), + (f.covar_samp(column("b"), column("c")), lambda a, b, c, d: np.array(np.cov(b, c, ddof=1)[0][1])), + # f.grouping(col_a), # No physical plan implemented yet + (f.max(column("a")), lambda a, b, c, d: np.array(np.max(a))), + (f.mean(column("b")), lambda a, b, c, d: np.array(np.mean(b))), + (f.median(column("b")), lambda a, b, c, d: 
np.array(np.median(b))), + (f.min(column("a")), lambda a, b, c, d: np.array(np.min(a))), + (f.sum(column("b")), lambda a, b, c, d: np.array(np.sum(b.to_pylist()))), + # Sample stdev -> ddof=1 + # Population stdev -> ddof=0 + (f.stddev(column("a")), lambda a, b, c, d: np.array(np.std(a, ddof=1))), + (f.stddev_pop(column("b")), lambda a, b, c, d: np.array(np.std(b, ddof=0))), + (f.stddev_samp(column("c")), lambda a, b, c, d: np.array(np.std(c, ddof=1))), + (f.var(column("a")), lambda a, b, c, d: np.array(np.var(a, ddof=1))), + (f.var_pop(column("b")), lambda a, b, c, d: np.array(np.var(b, ddof=0))), + (f.var_samp(column("c")), lambda a, b, c, d: np.array(np.var(c, ddof=1))), +]) +def test_aggregation_stats(df, agg_expr, calc_expected): + + agg_df = df.aggregate([], [agg_expr]) result = agg_df.collect()[0] values_a, values_b, values_c, values_d = df.collect()[0] + expected = calc_expected(values_a, values_b, values_c, values_d) + np.testing.assert_array_almost_equal(result.column(0), expected) + + +@pytest.mark.parametrize("agg_expr, expected", [ + (f.approx_distinct(column("b")), pa.array([2], type=pa.uint64())), + (f.approx_median(column("b")), pa.array([4])), + (f.approx_percentile_cont(column("b"), lit(0.5)), pa.array([4])), + ( + f.approx_percentile_cont_with_weight(column("b"), lit(0.6), lit(0.5)), + pa.array([6], type=pa.float64()) + ), + (f.array_agg(column("b")), pa.array([[4, 4, 6]])), +]) +def test_aggregation(df, agg_expr, expected): + agg_df = df.aggregate([], [agg_expr]) + result = agg_df.collect()[0] + assert result.column(0) == expected - assert result.column(0) == pa.array([2], type=pa.uint64()) - assert result.column(1) == pa.array([4]) - assert result.column(2) == pa.array([4]) - # Ref: https://github.com/apache/datafusion-python/issues/777 - # assert result.column(3) == pa.array([6]) - assert result.column(4) == pa.array([[4, 4, 6]]) - np.testing.assert_array_almost_equal(result.column(5), np.average(values_a)) - 
np.testing.assert_array_almost_equal( - result.column(6), np.corrcoef(values_a, values_b)[0][1] - ) - assert result.column(7) == pa.array([len(values_a)]) - # Sample (co)variance -> ddof=1 - # Population (co)variance -> ddof=0 - np.testing.assert_array_almost_equal( - result.column(8), np.cov(values_a, values_b, ddof=1)[0][1] - ) - np.testing.assert_array_almost_equal( - result.column(9), np.cov(values_a, values_c, ddof=0)[0][1] - ) - np.testing.assert_array_almost_equal( - result.column(10), np.cov(values_b, values_c, ddof=1)[0][1] - ) - np.testing.assert_array_almost_equal(result.column(11), np.max(values_a)) - np.testing.assert_array_almost_equal(result.column(12), np.mean(values_b)) - np.testing.assert_array_almost_equal(result.column(13), np.median(values_b)) - np.testing.assert_array_almost_equal(result.column(14), np.min(values_a)) - np.testing.assert_array_almost_equal( - result.column(15), np.sum(values_b.to_pylist()) - ) - np.testing.assert_array_almost_equal(result.column(16), np.std(values_a, ddof=1)) - np.testing.assert_array_almost_equal(result.column(17), np.std(values_b, ddof=0)) - np.testing.assert_array_almost_equal(result.column(18), np.std(values_c, ddof=1)) - np.testing.assert_array_almost_equal(result.column(19), np.var(values_a, ddof=1)) - np.testing.assert_array_almost_equal(result.column(20), np.var(values_b, ddof=0)) - np.testing.assert_array_almost_equal(result.column(21), np.var(values_c, ddof=1)) +def test_aggregate_100(df_aggregate_100): + # https://github.com/apache/datafusion/blob/bddb6415a50746d2803dd908d19c3758952d74f9/datafusion/sqllogictest/test_files/aggregate.slt#L1490-L1498 + + result = df_aggregate_100.aggregate( + [ + column("c1") + ], + [ + f.approx_percentile_cont(column("c3"), lit(0.95), lit(200)).alias("c3") + ] + ).sort(column("c1").sort(ascending=True)).collect() + + assert len(result) == 1 + result = result[0] + assert result.column("c1") == pa.array(["a", "b", "c", "d", "e"]) + assert result.column("c3") == 
pa.array([73, 68, 122, 124, 115]) def test_bit_add_or_xor(df): df = df.aggregate( diff --git a/src/functions.rs b/src/functions.rs index e60c63c8e..f8f478166 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -57,9 +57,15 @@ pub fn approx_percentile_cont( expression: PyExpr, percentile: PyExpr, distinct: bool, + num_centroids: Option, // enforces optional arguments at the end, currently ) -> PyResult { - let expr = - functions_aggregate::expr_fn::approx_percentile_cont(expression.expr, percentile.expr); + let args = if let Some(num_centroids) = num_centroids { + vec![expression.expr, percentile.expr, num_centroids.expr] + } else { + vec![expression.expr, percentile.expr] + }; + let udaf = functions_aggregate::approx_percentile_cont::approx_percentile_cont_udaf(); + let expr = udaf.call(args); if distinct { Ok(expr.distinct().build()?.into()) } else { From b6f06f760f8f4048a91538cbde15a4ab813f62ca Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 9 Aug 2024 09:23:09 -0400 Subject: [PATCH 012/248] UDAF process all state variables (#799) --- .../common-operations/udf-and-udfa.rst | 5 +++-- python/datafusion/tests/test_udaf.py | 4 ++-- python/datafusion/udf.py | 2 +- src/udaf.rs | 19 +++++++++++-------- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/docs/source/user-guide/common-operations/udf-and-udfa.rst b/docs/source/user-guide/common-operations/udf-and-udfa.rst index 54c685794..e9c142f0a 100644 --- a/docs/source/user-guide/common-operations/udf-and-udfa.rst +++ b/docs/source/user-guide/common-operations/udf-and-udfa.rst @@ -50,6 +50,7 @@ Additionally the :py:func:`~datafusion.udf.AggregateUDF.udaf` function allows yo import pyarrow.compute import datafusion from datafusion import col, udaf, Accumulator + from typing import List class MyAccumulator(Accumulator): """ @@ -62,9 +63,9 @@ Additionally the :py:func:`~datafusion.udf.AggregateUDF.udaf` function allows yo # not nice since pyarrow scalars can't be summed yet. 
This breaks on `None` self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(values).as_py()) - def merge(self, states: pyarrow.Array) -> None: + def merge(self, states: List[pyarrow.Array]) -> None: # not nice since pyarrow scalars can't be summed yet. This breaks on `None` - self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(states).as_py()) + self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(states[0]).as_py()) def state(self) -> pyarrow.Array: return pyarrow.array([self._sum.as_py()]) diff --git a/python/datafusion/tests/test_udaf.py b/python/datafusion/tests/test_udaf.py index 81194927c..76488e19b 100644 --- a/python/datafusion/tests/test_udaf.py +++ b/python/datafusion/tests/test_udaf.py @@ -38,10 +38,10 @@ def update(self, values: pa.Array) -> None: # This breaks on `None` self._sum = pa.scalar(self._sum.as_py() + pc.sum(values).as_py()) - def merge(self, states: pa.Array) -> None: + def merge(self, states: List[pa.Array]) -> None: # Not nice since pyarrow scalars can't be summed yet. # This breaks on `None` - self._sum = pa.scalar(self._sum.as_py() + pc.sum(states).as_py()) + self._sum = pa.scalar(self._sum.as_py() + pc.sum(states[0]).as_py()) def evaluate(self) -> pa.Scalar: return self._sum diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index 12563b3d2..bdbad661a 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -157,7 +157,7 @@ def update(self, values: pyarrow.Array) -> None: pass @abstractmethod - def merge(self, states: pyarrow.Array) -> None: + def merge(self, states: List[pyarrow.Array]) -> None: """Merge a set of states.""" pass diff --git a/src/udaf.rs b/src/udaf.rs index 7b5e03668..2041e5a74 100644 --- a/src/udaf.rs +++ b/src/udaf.rs @@ -72,18 +72,21 @@ impl Accumulator for RustAccumulator { fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { Python::with_gil(|py| { - let state = &states[0]; - - // 1. 
cast states to Pyarrow array - let state = state - .into_data() - .to_pyarrow(py) - .map_err(|e| DataFusionError::Execution(format!("{e}")))?; + // // 1. cast states to Pyarrow arrays + let py_states: Result> = states + .iter() + .map(|state| { + state + .into_data() + .to_pyarrow(py) + .map_err(|e| DataFusionError::Execution(format!("{e}"))) + }) + .collect(); // 2. call merge self.accum .bind(py) - .call_method1("merge", (state,)) + .call_method1("merge", (py_states?,)) .map_err(|e| DataFusionError::Execution(format!("{e}")))?; Ok(()) From e206a0cbfa54e13fa9008d7217c1843ab80cb788 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 9 Aug 2024 08:22:02 -0600 Subject: [PATCH 013/248] chore: Prepare for 40.0.0 release (#801) * use new changelog script from DataFusion * Update changelog --- CHANGELOG.md | 35 ++++++++++ dev/release/README.md | 2 +- dev/release/generate-changelog.py | 111 +++++++++++++++++++++--------- requirements.in | 1 + 4 files changed, 115 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32a74d825..305af5720 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,41 @@ # DataFusion Python Changelog +## [40.0.0](https://github.com/apache/datafusion-python/tree/40.0.0) (2024-08-09) + +This release consists of 18 commits from 4 contributors. See credits at the end of this changelog for more information. 
+ +- Update changelog for 39.0.0 [#742](https://github.com/apache/datafusion-python/pull/742) (andygrove) +- build(deps): bump uuid from 1.8.0 to 1.9.1 [#744](https://github.com/apache/datafusion-python/pull/744) (dependabot[bot]) +- build(deps): bump mimalloc from 0.1.42 to 0.1.43 [#745](https://github.com/apache/datafusion-python/pull/745) (dependabot[bot]) +- build(deps): bump syn from 2.0.67 to 2.0.68 [#746](https://github.com/apache/datafusion-python/pull/746) (dependabot[bot]) +- Tsaucer/find window fn [#747](https://github.com/apache/datafusion-python/pull/747) (timsaucer) +- Python wrapper classes for all user interfaces [#750](https://github.com/apache/datafusion-python/pull/750) (timsaucer) +- Expose array sort [#764](https://github.com/apache/datafusion-python/pull/764) (timsaucer) +- Upgrade protobuf and remove GH Action googletest-installer [#773](https://github.com/apache/datafusion-python/pull/773) (Michael-J-Ward) +- Upgrade Datafusion 40 [#771](https://github.com/apache/datafusion-python/pull/771) (Michael-J-Ward) +- Bugfix: Calling count with None arguments [#768](https://github.com/apache/datafusion-python/pull/768) (timsaucer) +- Add in user example that compares a two different approaches to UDFs [#770](https://github.com/apache/datafusion-python/pull/770) (timsaucer) +- Add missing exports for wrapper modules [#782](https://github.com/apache/datafusion-python/pull/782) (timsaucer) +- Add PyExpr to_variant conversions [#793](https://github.com/apache/datafusion-python/pull/793) (Michael-J-Ward) +- Add missing expressions to wrapper export [#795](https://github.com/apache/datafusion-python/pull/795) (timsaucer) +- Doc/cross reference [#791](https://github.com/apache/datafusion-python/pull/791) (timsaucer) +- Re-Enable `num_centroids` to `approx_percentile_cont` [#798](https://github.com/apache/datafusion-python/pull/798) (Michael-J-Ward) +- UDAF process all state variables [#799](https://github.com/apache/datafusion-python/pull/799) (timsaucer) 
+ +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 9 Tim Saucer + 4 Michael J Ward + 3 dependabot[bot] + 2 Andy Grove +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + ## [39.0.0](https://github.com/apache/datafusion-python/tree/39.0.0) (2024-06-25) **Merged pull requests:** diff --git a/dev/release/README.md b/dev/release/README.md index c4372c832..93c2f97b9 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -63,7 +63,7 @@ We maintain a `CHANGELOG.md` so our users know what has been changed between rel The changelog is generated using a Python script: ```bash -$ GITHUB_TOKEN= ./dev/release/generate-changelog.py apache/datafusion-python 24.0.0 HEAD > dev/changelog/25.0.0.md +$ GITHUB_TOKEN= ./dev/release/generate-changelog.py 24.0.0 HEAD 25.0.0 > dev/changelog/25.0.0.md ``` This script creates a changelog from GitHub PRs based on the labels associated with them as well as looking for diff --git a/dev/release/generate-changelog.py b/dev/release/generate-changelog.py index af097ce98..5645d2f74 100755 --- a/dev/release/generate-changelog.py +++ b/dev/release/generate-changelog.py @@ -20,23 +20,20 @@ from github import Github import os import re - +import subprocess def print_pulls(repo_name, title, pulls): - if len(pulls) > 0: + if len(pulls) > 0: print("**{}:**".format(title)) print() - for pull, commit in pulls: + for (pull, commit) in pulls: url = "https://github.com/{}/pull/{}".format(repo_name, pull.number) - print( - "- {} [#{}]({}) ({})".format( - pull.title, pull.number, url, commit.author.login - ) - ) + print("- {} [#{}]({}) ({})".format(pull.title, pull.number, url, commit.author.login)) print() -def generate_changelog(repo, repo_name, tag1, tag2): +def generate_changelog(repo, repo_name, tag1, tag2, version): + # get a list of commits between two tags 
print(f"Fetching list of commits between {tag1} and {tag2}", file=sys.stderr) comparison = repo.compare(tag1, tag2) @@ -55,45 +52,95 @@ def generate_changelog(repo, repo_name, tag1, tag2): all_pulls.append((pull, commit)) # we split the pulls into categories - # TODO: make categories configurable breaking = [] bugs = [] docs = [] enhancements = [] + performance = [] + other = [] # categorize the pull requests based on GitHub labels print("Categorizing pull requests", file=sys.stderr) - for pull, commit in all_pulls: + for (pull, commit) in all_pulls: + # see if PR title uses Conventional Commits - cc_type = "" - # cc_scope = '' - cc_breaking = "" - parts = re.findall(r"^([a-z]+)(\([a-z]+\))?(!)?:", pull.title) + cc_type = '' + cc_scope = '' + cc_breaking = '' + parts = re.findall(r'^([a-z]+)(\([a-z]+\))?(!)?:', pull.title) if len(parts) == 1: parts_tuple = parts[0] - cc_type = parts_tuple[0] # fix, feat, docs, chore - # cc_scope = parts_tuple[1] # component within project - cc_breaking = parts_tuple[2] == "!" + cc_type = parts_tuple[0] # fix, feat, docs, chore + cc_scope = parts_tuple[1] # component within project + cc_breaking = parts_tuple[2] == '!' 
labels = [label.name for label in pull.labels] - # print(pull.number, labels, parts, file=sys.stderr) - if "api change" in labels or cc_breaking: + if 'api change' in labels or cc_breaking: breaking.append((pull, commit)) - elif "bug" in labels or cc_type == "fix": + elif 'bug' in labels or cc_type == 'fix': bugs.append((pull, commit)) - elif "enhancement" in labels or cc_type == "feat": + elif 'performance' in labels or cc_type == 'perf': + performance.append((pull, commit)) + elif 'enhancement' in labels or cc_type == 'feat': enhancements.append((pull, commit)) - elif "documentation" in labels or cc_type == "docs": + elif 'documentation' in labels or cc_type == 'docs' or cc_type == 'doc': docs.append((pull, commit)) + else: + other.append((pull, commit)) # produce the changelog content print("Generating changelog content", file=sys.stderr) + + # ASF header + print("""\n""") + + print(f"# Apache DataFusion Python {version} Changelog\n") + + # get the number of commits + commit_count = subprocess.check_output(f"git log --pretty=oneline {tag1}..{tag2} | wc -l", shell=True, text=True).strip() + + # get number of contributors + contributor_count = subprocess.check_output(f"git shortlog -sn {tag1}..{tag2} | wc -l", shell=True, text=True).strip() + + print(f"This release consists of {commit_count} commits from {contributor_count} contributors. 
" + f"See credits at the end of this changelog for more information.\n") + print_pulls(repo_name, "Breaking changes", breaking) + print_pulls(repo_name, "Performance related", performance) print_pulls(repo_name, "Implemented enhancements", enhancements) print_pulls(repo_name, "Fixed bugs", bugs) print_pulls(repo_name, "Documentation updates", docs) - print_pulls(repo_name, "Merged pull requests", all_pulls) + print_pulls(repo_name, "Other", other) + # show code contributions + credits = subprocess.check_output(f"git shortlog -sn {tag1}..{tag2}", shell=True, text=True).rstrip() + + print("## Credits\n") + print("Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) " + "per contributor.\n") + print("```") + print(credits) + print("```\n") + + print("Thank you also to everyone who contributed in other ways such as filing issues, reviewing " + "PRs, and providing feedback on this release.\n") def cli(args=None): """Process command line arguments.""" @@ -101,19 +148,17 @@ def cli(args=None): args = sys.argv[1:] parser = argparse.ArgumentParser() - parser.add_argument( - "project", help="The project name e.g. apache/datafusion-python" - ) - parser.add_argument("tag1", help="The previous release tag") - parser.add_argument("tag2", help="The current release tag") + parser.add_argument("tag1", help="The previous commit or tag (e.g. 0.1.0)") + parser.add_argument("tag2", help="The current commit or tag (e.g. 
HEAD)") + parser.add_argument("version", help="The version number to include in the changelog") args = parser.parse_args() token = os.getenv("GITHUB_TOKEN") + project = "apache/datafusion-python" g = Github(token) - repo = g.get_repo(args.project) - generate_changelog(repo, args.project, args.tag1, args.tag2) - + repo = g.get_repo(project) + generate_changelog(repo, project, args.tag1, args.tag2, args.version) if __name__ == "__main__": - cli() + cli() \ No newline at end of file diff --git a/requirements.in b/requirements.in index b2a1a48df..1b7f62052 100644 --- a/requirements.in +++ b/requirements.in @@ -23,3 +23,4 @@ pytest ruff toml importlib_metadata; python_version < "3.8" +PyGitHub From b2982ecda723f77c2b0b343b030fd3fe589b2062 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 13 Aug 2024 12:11:29 -0400 Subject: [PATCH 014/248] Add typing-extensions dependency to pyproject (#805) * Add typing-extensions dependency to pyproject * Specify versions which require typing extensions. 
Co-authored-by: Kyle Barron --------- Co-authored-by: Kyle Barron --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index a18ef0e5e..4e03ce8db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ classifier = [ ] dependencies = [ "pyarrow>=11.0.0", + "typing-extensions;python_version<'3.13'", ] [project.urls] From 805183bd9ae32b628e641d9552f1d0dded4a7a85 Mon Sep 17 00:00:00 2001 From: Daniel Mesejo Date: Thu, 22 Aug 2024 03:53:30 +0200 Subject: [PATCH 015/248] feat: enable list of paths for read_csv (#824) --- python/datafusion/context.py | 7 +++++-- python/datafusion/tests/test_context.py | 16 ++++++++++++++++ src/context.rs | 15 +++++++-------- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 922cc87a3..47f2b9cf9 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -883,7 +883,7 @@ def read_json( def read_csv( self, - path: str | pathlib.Path, + path: str | pathlib.Path | list[str] | list[pathlib.Path], schema: pyarrow.Schema | None = None, has_header: bool = True, delimiter: str = ",", @@ -914,9 +914,12 @@ def read_csv( """ if table_partition_cols is None: table_partition_cols = [] + + path = [str(p) for p in path] if isinstance(path, list) else str(path) + return DataFrame( self.ctx.read_csv( - str(path), + path, schema, has_header, delimiter, diff --git a/python/datafusion/tests/test_context.py b/python/datafusion/tests/test_context.py index 8373659b0..1b424db8b 100644 --- a/python/datafusion/tests/test_context.py +++ b/python/datafusion/tests/test_context.py @@ -484,6 +484,22 @@ def test_read_csv(ctx): csv_df.select(column("c1")).show() +def test_read_csv_list(ctx): + csv_df = ctx.read_csv(path=["testing/data/csv/aggregate_test_100.csv"]) + expected = csv_df.count() * 2 + + double_csv_df = ctx.read_csv( + path=[ + "testing/data/csv/aggregate_test_100.csv", + 
"testing/data/csv/aggregate_test_100.csv", + ] + ) + actual = double_csv_df.count() + + double_csv_df.select(column("c1")).show() + assert actual == expected + + def test_read_csv_compressed(ctx, tmp_path): test_data_path = "testing/data/csv/aggregate_test_100.csv" diff --git a/src/context.rs b/src/context.rs index 50c4a1994..d7890e3f5 100644 --- a/src/context.rs +++ b/src/context.rs @@ -805,7 +805,7 @@ impl PySessionContext { file_compression_type=None))] pub fn read_csv( &self, - path: PathBuf, + path: &Bound<'_, PyAny>, schema: Option>, has_header: bool, delimiter: &str, @@ -815,10 +815,6 @@ impl PySessionContext { file_compression_type: Option, py: Python, ) -> PyResult { - let path = path - .to_str() - .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; - let delimiter = delimiter.as_bytes(); if delimiter.len() != 1 { return Err(PyValueError::new_err( @@ -833,13 +829,16 @@ impl PySessionContext { .file_extension(file_extension) .table_partition_cols(convert_table_partition_cols(table_partition_cols)?) 
.file_compression_type(parse_file_compression_type(file_compression_type)?); + options.schema = schema.as_ref().map(|x| &x.0); - if let Some(py_schema) = schema { - options.schema = Some(&py_schema.0); - let result = self.ctx.read_csv(path, options); + if path.is_instance_of::() { + let paths = path.extract::>()?; + let paths = paths.iter().map(|p| p as &str).collect::>(); + let result = self.ctx.read_csv(paths, options); let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?); Ok(df) } else { + let path = path.extract::()?; let result = self.ctx.read_csv(path, options); let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?); Ok(df) From 3b5085e0c716298af7c8f025b8b9edf601344f02 Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Fri, 23 Aug 2024 09:43:39 -0500 Subject: [PATCH 016/248] Upgrade deps to datafusion 41 (#802) * update datafusion deps to point to githuc.com/apache/datafusion Datafusion 41 is not yet released on crates.io. * update TableProvider::scan Ref: https://github.com/apache/datafusion/pull/11516 * use SessionStateBuilder The old constructor is deprecated. Ref: https://github.com/apache/datafusion/pull/11403 * update AggregateFunction Upstream Changes: - The field name was switched from `func_name` to func. - AggregateFunctionDefinition was removed Ref: https://github.com/apache/datafusion/pull/11803 * update imports in catalog Catlog API was extracted to a separate crate. Ref: https://github.com/apache/datafusion/pull/11516 * use appropriate path for approx_distinct Ref: https://github.com/apache/datafusion/pull/11644 * migrate AggregateExt to ExprFunctionExt Also removed `sqlparser` dependency since it's re-exported upstream. Ref: https://github.com/apache/datafusion/pull/11550 * update regr_count tests for new return type Ref: https://github.com/apache/datafusion/pull/11731 * migrate from function-array to functions-nested The package was renamed upstream. 
Ref: https://github.com/apache/datafusion/pull/11602 * cargo fmt * lock datafusion deps to 41 * remove todo from cargo.toml All the datafusion dependencies are re-exported, but I still need to figure out *why*. --- Cargo.lock | 108 ++++++++++++++-------- Cargo.toml | 15 ++- python/datafusion/tests/test_functions.py | 6 +- src/catalog.rs | 2 +- src/common/data_type.rs | 17 ++-- src/context.rs | 11 ++- src/dataset.rs | 4 +- src/expr/aggregate.rs | 10 +- src/expr/aggregate_expr.rs | 4 +- src/functions.rs | 44 ++++----- 10 files changed, 126 insertions(+), 95 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e59811210..bca4bf066 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -680,6 +680,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + [[package]] name = "crunchy" version = "0.2.2" @@ -725,11 +731,12 @@ checksum = "7762d17f1241643615821a8455a0b2c3e803784b058693d990b11f2dce25a0ca" [[package]] name = "dashmap" -version = "5.5.3" +version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +checksum = "804c8821570c3f8b70230c2ba75ffa5c0f9a4189b9a432b6656c536712acae28" dependencies = [ "cfg-if", + "crossbeam-utils", "hashbrown", "lock_api", "once_cell", @@ -738,9 +745,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "40.0.0" +version = "41.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab9d55a9cd2634818953809f75ebe5248b00dd43c3227efb2a51a2d5feaad54e" +checksum = "e4fd4a99fc70d40ef7e52b243b4a399c3f8d353a40d5ecb200deee05e49c61bb" dependencies = [ "ahash", "apache-avro", @@ -754,16 +761,18 @@ dependencies = [ "bzip2", "chrono", "dashmap", + "datafusion-catalog", "datafusion-common", "datafusion-common-runtime", "datafusion-execution", 
"datafusion-expr", "datafusion-functions", "datafusion-functions-aggregate", - "datafusion-functions-array", + "datafusion-functions-nested", "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-expr-common", + "datafusion-physical-optimizer", "datafusion-physical-plan", "datafusion-sql", "flate2", @@ -792,11 +801,25 @@ dependencies = [ "zstd 0.13.2", ] +[[package]] +name = "datafusion-catalog" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13b3cfbd84c6003594ae1972314e3df303a27ce8ce755fcea3240c90f4c0529" +dependencies = [ + "arrow-schema", + "async-trait", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-plan", +] + [[package]] name = "datafusion-common" -version = "40.0.0" +version = "41.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "def66b642959e7f96f5d2da22e1f43d3bd35598f821e5ce351a0553e0f1b7367" +checksum = "44fdbc877e3e40dcf88cc8f283d9f5c8851f0a3aa07fee657b1b75ac1ad49b9c" dependencies = [ "ahash", "apache-avro", @@ -818,18 +841,18 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "40.0.0" +version = "41.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f104bb9cb44c06c9badf8a0d7e0855e5f7fa5e395b887d7f835e8a9457dc1352" +checksum = "8a7496d1f664179f6ce3a5cbef6566056ccaf3ea4aa72cc455f80e62c1dd86b1" dependencies = [ "tokio", ] [[package]] name = "datafusion-execution" -version = "40.0.0" +version = "41.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ac0fd8b5d80bbca3fc3b6f40da4e9f6907354824ec3b18bbd83fee8cf5c3c3e" +checksum = "799e70968c815b611116951e3dd876aef04bf217da31b72eec01ee6a959336a1" dependencies = [ "arrow", "chrono", @@ -848,9 +871,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "40.0.0" +version = "41.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2103d2cc16fb11ef1fa993a6cac57ed5cb028601db4b97566c90e5fa77aa1e68" +checksum = "1c1841c409d9518c17971d15c9bae62e629eb937e6fb6c68cd32e9186f8b30d2" dependencies = [ "ahash", "arrow", @@ -867,11 +890,12 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "40.0.0" +version = "41.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a369332afd0ef5bd565f6db2139fb9f1dfdd0afa75a7f70f000b74208d76994f" +checksum = "a8e481cf34d2a444bd8fa09b65945f0ce83dc92df8665b761505b3d9f351bebb" dependencies = [ "arrow", + "arrow-buffer", "base64 0.22.1", "blake2", "blake3", @@ -893,9 +917,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "40.0.0" +version = "41.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92718db1aff70c47e5abf9fc975768530097059e5db7c7b78cd64b5e9a11fc77" +checksum = "2b4ece19f73c02727e5e8654d79cd5652de371352c1df3c4ac3e419ecd6943fb" dependencies = [ "ahash", "arrow", @@ -910,10 +934,10 @@ dependencies = [ ] [[package]] -name = "datafusion-functions-array" -version = "40.0.0" +name = "datafusion-functions-nested" +version = "41.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30bb80f46ff3dcf4bb4510209c2ba9b8ce1b716ac8b7bf70c6bf7dca6260c831" +checksum = "a1474552cc824e8c9c88177d454db5781d4b66757d4aca75719306b8343a5e8d" dependencies = [ "arrow", "arrow-array", @@ -928,13 +952,14 @@ dependencies = [ "itertools 0.12.1", "log", "paste", + "rand", ] [[package]] name = "datafusion-optimizer" -version = "40.0.0" +version = "41.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82f34692011bec4fdd6fc18c264bf8037b8625d801e6dd8f5111af15cb6d71d3" +checksum = "791ff56f55608bc542d1ea7a68a64bdc86a9413f5a381d06a39fd49c2a3ab906" dependencies = [ "arrow", "async-trait", @@ -952,9 +977,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "40.0.0" +version = "41.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "45538630defedb553771434a437f7ca8f04b9b3e834344aafacecb27dc65d5e5" +checksum = "9a223962b3041304a3e20ed07a21d5de3d88d7e4e71ca192135db6d24e3365a4" dependencies = [ "ahash", "arrow", @@ -982,9 +1007,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "40.0.0" +version = "41.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d8a72b0ca908e074aaeca52c14ddf5c28d22361e9cb6bc79bb733cd6661b536" +checksum = "db5e7d8532a1601cd916881db87a70b0a599900d23f3db2897d389032da53bc6" dependencies = [ "ahash", "arrow", @@ -994,11 +1019,23 @@ dependencies = [ "rand", ] +[[package]] +name = "datafusion-physical-optimizer" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb9c78f308e050f5004671039786a925c3fee83b90004e9fcfd328d7febdcc0" +dependencies = [ + "datafusion-common", + "datafusion-execution", + "datafusion-physical-expr", + "datafusion-physical-plan", +] + [[package]] name = "datafusion-physical-plan" -version = "40.0.0" +version = "41.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b504eae6107a342775e22e323e9103f7f42db593ec6103b28605b7b7b1405c4a" +checksum = "8d1116949432eb2d30f6362707e2846d942e491052a206f2ddcb42d08aea1ffe" dependencies = [ "ahash", "arrow", @@ -1037,7 +1074,7 @@ dependencies = [ "datafusion", "datafusion-common", "datafusion-expr", - "datafusion-functions-array", + "datafusion-functions-nested", "datafusion-optimizer", "datafusion-sql", "datafusion-substrait", @@ -1051,7 +1088,6 @@ dependencies = [ "pyo3-build-config", "rand", "regex-syntax", - "sqlparser", "syn 2.0.72", "tokio", "url", @@ -1060,9 +1096,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "40.0.0" +version = "41.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5db33f323f41b95ae201318ba654a9bf11113e58a51a1dff977b1a836d3d889" 
+checksum = "b45d0180711165fe94015d7c4123eb3e1cf5fb60b1506453200b8d1ce666bef0" dependencies = [ "arrow", "arrow-array", @@ -1077,9 +1113,9 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "40.0.0" +version = "41.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "434e52fbff22e6e04e6c787f603a6aba4961a7e249a29c743c5d4f609ec2dcef" +checksum = "bf0a0055aa98246c79f98f0d03df11f16cb7adc87818d02d4413e3f3cdadbbee" dependencies = [ "arrow-buffer", "async-recursion", @@ -2898,9 +2934,9 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "sqlparser" -version = "0.47.0" +version = "0.49.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "295e9930cd7a97e58ca2a070541a3ca502b17f5d1fa7157376d0fabd85324f25" +checksum = "a4a404d0e14905361b918cb8afdb73605e25c1d5029312bd9785142dcb3aa49e" dependencies = [ "log", "sqlparser_derive", diff --git a/Cargo.toml b/Cargo.toml index 820118fa8..8881884b4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,13 +38,13 @@ tokio = { version = "1.39", features = ["macros", "rt", "rt-multi-thread", "sync rand = "0.8" pyo3 = { version = "0.21", features = ["extension-module", "abi3", "abi3-py38"] } arrow = { version = "52", feature = ["pyarrow"] } -datafusion = { version = "40.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } -datafusion-common = { version = "40.0.0", features = ["pyarrow"] } -datafusion-expr = "40.0.0" -datafusion-functions-array = "40.0.0" -datafusion-optimizer = "40.0.0" -datafusion-sql = "40.0.0" -datafusion-substrait = { version = "40.0.0", optional = true } +datafusion = { version = "41.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } +datafusion-common = { version = "41.0.0", features = ["pyarrow"] } +datafusion-expr = { version = "41.0.0" } +datafusion-functions-nested = { version = "41.0.0" } +datafusion-optimizer = { version = "41.0.0" } +datafusion-sql = { version = "41.0.0" } 
+datafusion-substrait = { version = "41.0.0", optional = true } prost = "0.12" # keep in line with `datafusion-substrait` prost-types = "0.12" # keep in line with `datafusion-substrait` uuid = { version = "1.9", features = ["v4"] } @@ -56,7 +56,6 @@ parking_lot = "0.12" regex-syntax = "0.8" syn = "2.0.68" url = "2" -sqlparser = "0.47.0" [build-dependencies] pyo3-build-config = "0.21" diff --git a/python/datafusion/tests/test_functions.py b/python/datafusion/tests/test_functions.py index 293912321..b8ad9c0d7 100644 --- a/python/datafusion/tests/test_functions.py +++ b/python/datafusion/tests/test_functions.py @@ -808,7 +808,7 @@ def test_regr_funcs_sql(df): assert result[0].column(0) == pa.array([None], type=pa.float64()) assert result[0].column(1) == pa.array([None], type=pa.float64()) - assert result[0].column(2) == pa.array([1], type=pa.float64()) + assert result[0].column(2) == pa.array([1], type=pa.uint64()) assert result[0].column(3) == pa.array([None], type=pa.float64()) assert result[0].column(4) == pa.array([1], type=pa.float64()) assert result[0].column(5) == pa.array([1], type=pa.float64()) @@ -840,7 +840,7 @@ def test_regr_funcs_sql_2(): # Assertions for SQL results assert result_sql[0].column(0) == pa.array([2], type=pa.float64()) assert result_sql[0].column(1) == pa.array([0], type=pa.float64()) - assert result_sql[0].column(2) == pa.array([3], type=pa.float64()) # todo: i would not expect this to be float + assert result_sql[0].column(2) == pa.array([3], type=pa.uint64()) assert result_sql[0].column(3) == pa.array([1], type=pa.float64()) assert result_sql[0].column(4) == pa.array([2], type=pa.float64()) assert result_sql[0].column(5) == pa.array([4], type=pa.float64()) @@ -852,7 +852,7 @@ def test_regr_funcs_sql_2(): @pytest.mark.parametrize("func, expected", [ pytest.param(f.regr_slope, pa.array([2], type=pa.float64()), id="regr_slope"), pytest.param(f.regr_intercept, pa.array([0], type=pa.float64()), id="regr_intercept"), - 
pytest.param(f.regr_count, pa.array([3], type=pa.float64()), id="regr_count"), # TODO: I would expect this to return an int array + pytest.param(f.regr_count, pa.array([3], type=pa.uint64()), id="regr_count"), pytest.param(f.regr_r2, pa.array([1], type=pa.float64()), id="regr_r2"), pytest.param(f.regr_avgx, pa.array([2], type=pa.float64()), id="regr_avgx"), pytest.param(f.regr_avgy, pa.array([4], type=pa.float64()), id="regr_avgy"), diff --git a/src/catalog.rs b/src/catalog.rs index 49fe14046..1ce66a4dc 100644 --- a/src/catalog.rs +++ b/src/catalog.rs @@ -25,7 +25,7 @@ use crate::errors::DataFusionError; use crate::utils::wait_for_future; use datafusion::{ arrow::pyarrow::ToPyArrow, - catalog::{schema::SchemaProvider, CatalogProvider}, + catalog::{CatalogProvider, SchemaProvider}, datasource::{TableProvider, TableType}, }; diff --git a/src/common/data_type.rs b/src/common/data_type.rs index 469bb789a..21b085c0e 100644 --- a/src/common/data_type.rs +++ b/src/common/data_type.rs @@ -18,6 +18,7 @@ use datafusion::arrow::array::Array; use datafusion::arrow::datatypes::{DataType, IntervalUnit, TimeUnit}; use datafusion_common::{DataFusionError, ScalarValue}; +use datafusion_expr::sqlparser::ast::NullTreatment as DFNullTreatment; use pyo3::{exceptions::PyValueError, prelude::*}; use crate::errors::py_datafusion_err; @@ -775,20 +776,20 @@ pub enum NullTreatment { RESPECT_NULLS, } -impl From for sqlparser::ast::NullTreatment { - fn from(null_treatment: NullTreatment) -> sqlparser::ast::NullTreatment { +impl From for DFNullTreatment { + fn from(null_treatment: NullTreatment) -> DFNullTreatment { match null_treatment { - NullTreatment::IGNORE_NULLS => sqlparser::ast::NullTreatment::IgnoreNulls, - NullTreatment::RESPECT_NULLS => sqlparser::ast::NullTreatment::RespectNulls, + NullTreatment::IGNORE_NULLS => DFNullTreatment::IgnoreNulls, + NullTreatment::RESPECT_NULLS => DFNullTreatment::RespectNulls, } } } -impl From for NullTreatment { - fn from(null_treatment: 
sqlparser::ast::NullTreatment) -> NullTreatment { +impl From for NullTreatment { + fn from(null_treatment: DFNullTreatment) -> NullTreatment { match null_treatment { - sqlparser::ast::NullTreatment::IgnoreNulls => NullTreatment::IGNORE_NULLS, - sqlparser::ast::NullTreatment::RespectNulls => NullTreatment::RESPECT_NULLS, + DFNullTreatment::IgnoreNulls => NullTreatment::IGNORE_NULLS, + DFNullTreatment::RespectNulls => NullTreatment::RESPECT_NULLS, } } } diff --git a/src/context.rs b/src/context.rs index d7890e3f5..a43599cf7 100644 --- a/src/context.rs +++ b/src/context.rs @@ -20,6 +20,7 @@ use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; +use datafusion::execution::session_state::SessionStateBuilder; use object_store::ObjectStore; use url::Url; use uuid::Uuid; @@ -49,9 +50,7 @@ use datafusion::datasource::listing::{ }; use datafusion::datasource::MemTable; use datafusion::datasource::TableProvider; -use datafusion::execution::context::{ - SQLOptions, SessionConfig, SessionContext, SessionState, TaskContext, -}; +use datafusion::execution::context::{SQLOptions, SessionConfig, SessionContext, TaskContext}; use datafusion::execution::disk_manager::DiskManagerConfig; use datafusion::execution::memory_pool::{FairSpillPool, GreedyMemoryPool, UnboundedMemoryPool}; use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; @@ -281,7 +280,11 @@ impl PySessionContext { RuntimeConfig::default() }; let runtime = Arc::new(RuntimeEnv::new(runtime_config)?); - let session_state = SessionState::new_with_config_rt(config, runtime); + let session_state = SessionStateBuilder::new() + .with_config(config) + .with_runtime_env(runtime) + .with_default_features() + .build(); Ok(PySessionContext { ctx: SessionContext::new_with_state(session_state), }) diff --git a/src/dataset.rs b/src/dataset.rs index 724b4af76..b5704164f 100644 --- a/src/dataset.rs +++ b/src/dataset.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the 
License. +use datafusion::catalog::Session; use pyo3::exceptions::PyValueError; /// Implements a Datafusion TableProvider that delegates to a PyArrow Dataset /// This allows us to use PyArrow Datasets as Datafusion tables while pushing down projections and filters @@ -30,7 +31,6 @@ use datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::pyarrow::PyArrowType; use datafusion::datasource::{TableProvider, TableType}; use datafusion::error::{DataFusionError, Result as DFResult}; -use datafusion::execution::context::SessionState; use datafusion::logical_expr::TableProviderFilterPushDown; use datafusion::physical_plan::ExecutionPlan; use datafusion_expr::Expr; @@ -98,7 +98,7 @@ impl TableProvider for Dataset { /// parallelized or distributed. async fn scan( &self, - _ctx: &SessionState, + _ctx: &dyn Session, projection: Option<&Vec>, filters: &[Expr], // limit can be used to reduce the amount scanned diff --git a/src/expr/aggregate.rs b/src/expr/aggregate.rs index 626d92c79..e3d1bb136 100644 --- a/src/expr/aggregate.rs +++ b/src/expr/aggregate.rs @@ -126,9 +126,9 @@ impl PyAggregate { match expr { // TODO: This Alias logic seems to be returning some strange results that we should investigate Expr::Alias(Alias { expr, .. }) => self._aggregation_arguments(expr.as_ref()), - Expr::AggregateFunction(AggregateFunction { - func_def: _, args, .. - }) => Ok(args.iter().map(|e| PyExpr::from(e.clone())).collect()), + Expr::AggregateFunction(AggregateFunction { func: _, args, .. }) => { + Ok(args.iter().map(|e| PyExpr::from(e.clone())).collect()) + } _ => Err(py_type_err( "Encountered a non Aggregate type in aggregation_arguments", )), @@ -138,9 +138,7 @@ impl PyAggregate { fn _agg_func_name(expr: &Expr) -> PyResult { match expr { Expr::Alias(Alias { expr, .. }) => Self::_agg_func_name(expr.as_ref()), - Expr::AggregateFunction(AggregateFunction { func_def, .. }) => { - Ok(func_def.name().to_owned()) - } + Expr::AggregateFunction(AggregateFunction { func, .. 
}) => Ok(func.name().to_owned()), _ => Err(py_type_err( "Encountered a non Aggregate type in agg_func_name", )), diff --git a/src/expr/aggregate_expr.rs b/src/expr/aggregate_expr.rs index 04ec29a15..15097e007 100644 --- a/src/expr/aggregate_expr.rs +++ b/src/expr/aggregate_expr.rs @@ -41,7 +41,7 @@ impl From for PyAggregateFunction { impl Display for PyAggregateFunction { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { let args: Vec = self.aggr.args.iter().map(|expr| expr.to_string()).collect(); - write!(f, "{}({})", self.aggr.func_def.name(), args.join(", ")) + write!(f, "{}({})", self.aggr.func.name(), args.join(", ")) } } @@ -49,7 +49,7 @@ impl Display for PyAggregateFunction { impl PyAggregateFunction { /// Get the aggregate type, such as "MIN", or "MAX" fn aggregate_type(&self) -> String { - self.aggr.func_def.name().to_string() + self.aggr.func.name().to_string() } /// is this a distinct aggregate such as `COUNT(DISTINCT expr)` diff --git a/src/functions.rs b/src/functions.rs index f8f478166..c53d4ad92 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -16,7 +16,7 @@ // under the License. 
use datafusion::functions_aggregate::all_default_aggregate_functions; -use datafusion_expr::AggregateExt; +use datafusion_expr::ExprFunctionExt; use pyo3::{prelude::*, wrap_pyfunction}; use crate::common::data_type::NullTreatment; @@ -30,16 +30,15 @@ use datafusion::functions; use datafusion::functions_aggregate; use datafusion_common::{Column, ScalarValue, TableReference}; use datafusion_expr::expr::Alias; +use datafusion_expr::sqlparser::ast::NullTreatment as DFNullTreatment; use datafusion_expr::{ - expr::{ - find_df_window_func, AggregateFunction, AggregateFunctionDefinition, Sort, WindowFunction, - }, + expr::{find_df_window_func, AggregateFunction, Sort, WindowFunction}, lit, Expr, WindowFunctionDefinition, }; #[pyfunction] pub fn approx_distinct(expression: PyExpr) -> PyExpr { - functions_aggregate::expr_fn::approx_distinct::approx_distinct(expression.expr).into() + functions_aggregate::expr_fn::approx_distinct(expression.expr).into() } #[pyfunction] @@ -342,9 +341,8 @@ pub fn first_value( builder = builder.filter(filter.expr); } - if let Some(null_treatment) = null_treatment { - builder = builder.null_treatment(null_treatment.into()) - } + // would be nice if all the options builder methods accepted Option ... 
+ builder = builder.null_treatment(null_treatment.map(DFNullTreatment::from)); Ok(builder.build()?.into()) } @@ -373,9 +371,7 @@ pub fn last_value( builder = builder.filter(filter.expr); } - if let Some(null_treatment) = null_treatment { - builder = builder.null_treatment(null_treatment.into()) - } + builder = builder.null_treatment(null_treatment.map(DFNullTreatment::from)); Ok(builder.build()?.into()) } @@ -392,14 +388,14 @@ fn in_list(expr: PyExpr, value: Vec, negated: bool) -> PyExpr { #[pyfunction] fn make_array(exprs: Vec) -> PyExpr { - datafusion_functions_array::expr_fn::make_array(exprs.into_iter().map(|x| x.into()).collect()) + datafusion_functions_nested::expr_fn::make_array(exprs.into_iter().map(|x| x.into()).collect()) .into() } #[pyfunction] fn array_concat(exprs: Vec) -> PyExpr { let exprs = exprs.into_iter().map(|x| x.into()).collect(); - datafusion_functions_array::expr_fn::array_concat(exprs).into() + datafusion_functions_nested::expr_fn::array_concat(exprs).into() } #[pyfunction] @@ -411,12 +407,12 @@ fn array_cat(exprs: Vec) -> PyExpr { fn array_position(array: PyExpr, element: PyExpr, index: Option) -> PyExpr { let index = ScalarValue::Int64(index); let index = Expr::Literal(index); - datafusion_functions_array::expr_fn::array_position(array.into(), element.into(), index).into() + datafusion_functions_nested::expr_fn::array_position(array.into(), element.into(), index).into() } #[pyfunction] fn array_slice(array: PyExpr, begin: PyExpr, end: PyExpr, stride: Option) -> PyExpr { - datafusion_functions_array::expr_fn::array_slice( + datafusion_functions_nested::expr_fn::array_slice( array.into(), begin.into(), end.into(), @@ -638,18 +634,16 @@ fn window( } macro_rules! 
aggregate_function { - ($NAME: ident, $FUNC: ident) => { + ($NAME: ident, $FUNC: path) => { aggregate_function!($NAME, $FUNC, stringify!($NAME)); }; - ($NAME: ident, $FUNC: ident, $DOC: expr) => { + ($NAME: ident, $FUNC: path, $DOC: expr) => { #[doc = $DOC] #[pyfunction] #[pyo3(signature = (*args, distinct=false))] fn $NAME(args: Vec, distinct: bool) -> PyExpr { let expr = datafusion_expr::Expr::AggregateFunction(AggregateFunction { - func_def: AggregateFunctionDefinition::BuiltIn( - datafusion_expr::aggregate_function::AggregateFunction::$FUNC, - ), + func: $FUNC(), args: args.into_iter().map(|e| e.into()).collect(), distinct, filter: None, @@ -701,7 +695,7 @@ macro_rules! expr_fn_vec { }; } -/// Generates a [pyo3] wrapper for [datafusion_functions_array::expr_fn] +/// Generates a [pyo3] wrapper for [datafusion_functions_nested::expr_fn] /// /// These functions have explicit named arguments. macro_rules! array_fn { @@ -718,7 +712,7 @@ macro_rules! array_fn { #[doc = $DOC] #[pyfunction] fn $FUNC($($arg: PyExpr),*) -> PyExpr { - datafusion_functions_array::expr_fn::$FUNC($($arg.into()),*).into() + datafusion_functions_nested::expr_fn::$FUNC($($arg.into()),*).into() } }; } @@ -884,9 +878,9 @@ array_fn!(array_resize, array size value); array_fn!(flatten, array); array_fn!(range, start stop step); -aggregate_function!(array_agg, ArrayAgg); -aggregate_function!(max, Max); -aggregate_function!(min, Min); +aggregate_function!(array_agg, functions_aggregate::array_agg::array_agg_udaf); +aggregate_function!(max, functions_aggregate::min_max::max_udaf); +aggregate_function!(min, functions_aggregate::min_max::min_udaf); pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(abs))?; From 02414d81b2dae6e9a019a079ce7b7958b80ca538 Mon Sep 17 00:00:00 2001 From: Jim Crist-Harif Date: Fri, 23 Aug 2024 14:29:39 -0500 Subject: [PATCH 017/248] Fix SessionContext init with only SessionConfig (#827) Previously creating a `SessionContext` when 
specifying only a `SessionConfig` would error. --- python/datafusion/context.py | 2 +- python/datafusion/tests/test_context.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 47f2b9cf9..d4e50cfe2 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -445,7 +445,7 @@ def __init__( df = ctx.read_csv("data.csv") """ config = config.config_internal if config is not None else None - runtime = runtime.config_internal if config is not None else None + runtime = runtime.config_internal if runtime is not None else None self.ctx = SessionContextInternal(config, runtime) diff --git a/python/datafusion/tests/test_context.py b/python/datafusion/tests/test_context.py index 1b424db8b..66d7e013a 100644 --- a/python/datafusion/tests/test_context.py +++ b/python/datafusion/tests/test_context.py @@ -38,6 +38,14 @@ def test_create_context_no_args(): SessionContext() +def test_create_context_session_config_only(): + SessionContext(config=SessionConfig()) + + +def test_create_context_runtime_config_only(): + SessionContext(runtime=RuntimeConfig()) + + @pytest.mark.parametrize("path_to_str", (True, False)) def test_runtime_configs(tmp_path, path_to_str): path1 = tmp_path / "dir1" From 22c70ef55fb56e38d9f4465b974b33b3685a6a40 Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Fri, 23 Aug 2024 14:30:52 -0500 Subject: [PATCH 018/248] build(deps): upgrade actions/{upload,download}-artifact@v3 to v4 (#829) * update actions/upload-artifact from v3 to v4 * update actions/download-artifact from v3 to v4 * build(dist): each build artifact gets a unique key and merged separately Uploads are now immutable for actions/upload-artifact@v4. This gives each build artifact a unique key, and then merges them together at the end. The [migration guide] suggests this pattern. 
[migration guide]: https://github.com/actions/upload-artifact/blob/main/docs/MIGRATION.md#merging-multiple-artifacts * use matrix.os to create unique upload key for build-python-mac-win job * use manylinux-x86_64 name for consistency with manylinux-aarch64 --- .github/workflows/build.yml | 59 ++++++++++++++++++++++++------------- .github/workflows/conda.yml | 2 +- 2 files changed, 40 insertions(+), 21 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 350be46d5..0b974f2d0 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -51,7 +51,7 @@ jobs: override: true - name: Generate license file run: python ./dev/create_license.py - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: python-wheel-license path: LICENSE.txt @@ -84,7 +84,7 @@ jobs: - run: rm LICENSE.txt - name: Download LICENSE.txt - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: python-wheel-license path: . @@ -110,9 +110,9 @@ jobs: run: find target/wheels/ - name: Archive wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: dist + name: dist-${{ matrix.os }} path: target/wheels/* build-macos-aarch64: @@ -145,7 +145,7 @@ jobs: - run: rm LICENSE.txt - name: Download LICENSE.txt - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: python-wheel-license path: . @@ -162,12 +162,12 @@ jobs: run: find target/wheels/ - name: Archive wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: dist + name: dist-macos-aarch64 path: target/wheels/* - build-manylinux: + build-manylinux-x86_64: needs: [generate-license] name: Manylinux runs-on: ubuntu-latest @@ -175,7 +175,7 @@ jobs: - uses: actions/checkout@v4 - run: rm LICENSE.txt - name: Download LICENSE.txt - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: python-wheel-license path: . 
@@ -191,9 +191,9 @@ jobs: rustup-components: rust-std rustfmt # Keep them in one line due to https://github.com/PyO3/maturin-action/issues/153 args: --release --manylinux 2014 --features protoc,substrait - name: Archive wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: dist + name: dist-manylinux-x86_64 path: target/wheels/* build-manylinux-aarch64: @@ -204,7 +204,7 @@ jobs: - uses: actions/checkout@v4 - run: rm LICENSE.txt - name: Download LICENSE.txt - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: python-wheel-license path: . @@ -221,9 +221,9 @@ jobs: rustup-components: rust-std rustfmt # Keep them in one line due to https://github.com/PyO3/maturin-action/issues/153 args: --release --features protoc,substrait - name: Archive wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: dist + name: dist-manylinux-aarch64 path: target/wheels/* build-sdist: @@ -234,7 +234,7 @@ jobs: - uses: actions/checkout@v4 - run: rm LICENSE.txt - name: Download LICENSE.txt - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: python-wheel-license path: . 
@@ -246,19 +246,38 @@ jobs: manylinux: auto rustup-components: rust-std rustfmt args: --release --sdist --out dist --features protoc,substrait - - name: Archive wheels - uses: actions/upload-artifact@v3 + - name: Assert sdist build does not generate wheels + run: | + if [ "$(ls -A target/wheels)" ]; then + echo "Error: Sdist build generated wheels" + exit 1 + else + echo "Directory is clean" + fi + shell: bash + + merge-build-artifacts: + runs-on: ubuntu-latest + needs: + - build-python-mac-win + - build-macos-aarch64 + - build-manylinux-x86_64 + - build-manylinux-aarch64 + - build-sdist + steps: + - name: Merge Build Artifacts + uses: actions/upload-artifact/merge@v4 with: name: dist - path: target/wheels/* - + pattern: dist-* + # NOTE: PyPI publish needs to be done manually for now after release passed the vote # release: # name: Publish in PyPI # needs: [build-manylinux, build-python-mac-win] # runs-on: ubuntu-latest # steps: - # - uses: actions/download-artifact@v3 + # - uses: actions/download-artifact@v4 # - name: Publish to PyPI # uses: pypa/gh-action-pypi-publish@master # with: diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index f25a431c4..52888cb14 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -101,7 +101,7 @@ jobs: run: | conda mambabuild --test packages/${{ matrix.arch }}/*.tar.bz2 - name: Upload conda packages as artifacts - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: "conda nightlies (python - ${{ matrix.python }}, arch - ${{ matrix.arch }})" # need to install all conda channel metadata to properly install locally From 766e2edccb766aacd1bc9de6e8ed1c356f69486a Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 27 Aug 2024 12:18:46 -0400 Subject: [PATCH 019/248] Run ruff format in CI (#837) * Run ruff format in CI * Add --check parameter * Apply ruff format --- .github/workflows/build.yml | 4 +- python/datafusion/functions.py | 9 +- 
python/datafusion/tests/test_aggregation.py | 105 ++++++++------ python/datafusion/tests/test_dataframe.py | 90 ++++++------ python/datafusion/tests/test_expr.py | 14 +- python/datafusion/tests/test_functions.py | 150 +++++++++++++------- python/datafusion/tests/test_sql.py | 32 +++-- 7 files changed, 251 insertions(+), 153 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0b974f2d0..a4f8b2da5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -38,7 +38,9 @@ jobs: pip install ruff # Update output format to enable automatic inline annotations. - name: Run Ruff - run: ruff check --output-format=github python/ + run: | + ruff check --output-format=github python/ + ruff format --check python/ generate-license: runs-on: ubuntu-latest diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 2d3d87ee0..59a1974fd 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1479,12 +1479,17 @@ def approx_percentile_cont( """Returns the value that is approximately at a given percentile of ``expr``.""" if num_centroids is None: return Expr( - f.approx_percentile_cont(expression.expr, percentile.expr, distinct=distinct, num_centroids=None) + f.approx_percentile_cont( + expression.expr, percentile.expr, distinct=distinct, num_centroids=None + ) ) return Expr( f.approx_percentile_cont( - expression.expr, percentile.expr, distinct=distinct, num_centroids=num_centroids.expr + expression.expr, + percentile.expr, + distinct=distinct, + num_centroids=num_centroids.expr, ) ) diff --git a/python/datafusion/tests/test_aggregation.py b/python/datafusion/tests/test_aggregation.py index 03485da4b..ab653c403 100644 --- a/python/datafusion/tests/test_aggregation.py +++ b/python/datafusion/tests/test_aggregation.py @@ -39,6 +39,7 @@ def df(): ) return ctx.create_dataframe([[batch]]) + @pytest.fixture def df_aggregate_100(): ctx = SessionContext() @@ -46,32 +47,46 @@ def 
df_aggregate_100(): return ctx.table("aggregate_test_data") -@pytest.mark.parametrize("agg_expr, calc_expected", [ - (f.avg(column("a")), lambda a, b, c, d: np.array(np.average(a))), - (f.corr(column("a"), column("b")), lambda a, b, c, d: np.array(np.corrcoef(a, b)[0][1])), - (f.count(column("a")), lambda a, b, c, d: pa.array([len(a)])), - # Sample (co)variance -> ddof=1 - # Population (co)variance -> ddof=0 - (f.covar(column("a"), column("b")), lambda a, b, c, d: np.array(np.cov(a, b, ddof=1)[0][1])), - (f.covar_pop(column("a"), column("c")), lambda a, b, c, d: np.array(np.cov(a, c, ddof=0)[0][1])), - (f.covar_samp(column("b"), column("c")), lambda a, b, c, d: np.array(np.cov(b, c, ddof=1)[0][1])), - # f.grouping(col_a), # No physical plan implemented yet - (f.max(column("a")), lambda a, b, c, d: np.array(np.max(a))), - (f.mean(column("b")), lambda a, b, c, d: np.array(np.mean(b))), - (f.median(column("b")), lambda a, b, c, d: np.array(np.median(b))), - (f.min(column("a")), lambda a, b, c, d: np.array(np.min(a))), - (f.sum(column("b")), lambda a, b, c, d: np.array(np.sum(b.to_pylist()))), - # Sample stdev -> ddof=1 - # Population stdev -> ddof=0 - (f.stddev(column("a")), lambda a, b, c, d: np.array(np.std(a, ddof=1))), - (f.stddev_pop(column("b")), lambda a, b, c, d: np.array(np.std(b, ddof=0))), - (f.stddev_samp(column("c")), lambda a, b, c, d: np.array(np.std(c, ddof=1))), - (f.var(column("a")), lambda a, b, c, d: np.array(np.var(a, ddof=1))), - (f.var_pop(column("b")), lambda a, b, c, d: np.array(np.var(b, ddof=0))), - (f.var_samp(column("c")), lambda a, b, c, d: np.array(np.var(c, ddof=1))), -]) +@pytest.mark.parametrize( + "agg_expr, calc_expected", + [ + (f.avg(column("a")), lambda a, b, c, d: np.array(np.average(a))), + ( + f.corr(column("a"), column("b")), + lambda a, b, c, d: np.array(np.corrcoef(a, b)[0][1]), + ), + (f.count(column("a")), lambda a, b, c, d: pa.array([len(a)])), + # Sample (co)variance -> ddof=1 + # Population (co)variance -> ddof=0 + ( + 
f.covar(column("a"), column("b")), + lambda a, b, c, d: np.array(np.cov(a, b, ddof=1)[0][1]), + ), + ( + f.covar_pop(column("a"), column("c")), + lambda a, b, c, d: np.array(np.cov(a, c, ddof=0)[0][1]), + ), + ( + f.covar_samp(column("b"), column("c")), + lambda a, b, c, d: np.array(np.cov(b, c, ddof=1)[0][1]), + ), + # f.grouping(col_a), # No physical plan implemented yet + (f.max(column("a")), lambda a, b, c, d: np.array(np.max(a))), + (f.mean(column("b")), lambda a, b, c, d: np.array(np.mean(b))), + (f.median(column("b")), lambda a, b, c, d: np.array(np.median(b))), + (f.min(column("a")), lambda a, b, c, d: np.array(np.min(a))), + (f.sum(column("b")), lambda a, b, c, d: np.array(np.sum(b.to_pylist()))), + # Sample stdev -> ddof=1 + # Population stdev -> ddof=0 + (f.stddev(column("a")), lambda a, b, c, d: np.array(np.std(a, ddof=1))), + (f.stddev_pop(column("b")), lambda a, b, c, d: np.array(np.std(b, ddof=0))), + (f.stddev_samp(column("c")), lambda a, b, c, d: np.array(np.std(c, ddof=1))), + (f.var(column("a")), lambda a, b, c, d: np.array(np.var(a, ddof=1))), + (f.var_pop(column("b")), lambda a, b, c, d: np.array(np.var(b, ddof=0))), + (f.var_samp(column("c")), lambda a, b, c, d: np.array(np.var(c, ddof=1))), + ], +) def test_aggregation_stats(df, agg_expr, calc_expected): - agg_df = df.aggregate([], [agg_expr]) result = agg_df.collect()[0] values_a, values_b, values_c, values_d = df.collect()[0] @@ -79,16 +94,19 @@ def test_aggregation_stats(df, agg_expr, calc_expected): np.testing.assert_array_almost_equal(result.column(0), expected) -@pytest.mark.parametrize("agg_expr, expected", [ - (f.approx_distinct(column("b")), pa.array([2], type=pa.uint64())), - (f.approx_median(column("b")), pa.array([4])), - (f.approx_percentile_cont(column("b"), lit(0.5)), pa.array([4])), - ( - f.approx_percentile_cont_with_weight(column("b"), lit(0.6), lit(0.5)), - pa.array([6], type=pa.float64()) - ), - (f.array_agg(column("b")), pa.array([[4, 4, 6]])), -]) 
+@pytest.mark.parametrize( + "agg_expr, expected", + [ + (f.approx_distinct(column("b")), pa.array([2], type=pa.uint64())), + (f.approx_median(column("b")), pa.array([4])), + (f.approx_percentile_cont(column("b"), lit(0.5)), pa.array([4])), + ( + f.approx_percentile_cont_with_weight(column("b"), lit(0.6), lit(0.5)), + pa.array([6], type=pa.float64()), + ), + (f.array_agg(column("b")), pa.array([[4, 4, 6]])), + ], +) def test_aggregation(df, agg_expr, expected): agg_df = df.aggregate([], [agg_expr]) result = agg_df.collect()[0] @@ -98,20 +116,21 @@ def test_aggregation(df, agg_expr, expected): def test_aggregate_100(df_aggregate_100): # https://github.com/apache/datafusion/blob/bddb6415a50746d2803dd908d19c3758952d74f9/datafusion/sqllogictest/test_files/aggregate.slt#L1490-L1498 - result = df_aggregate_100.aggregate( - [ - column("c1") - ], - [ - f.approx_percentile_cont(column("c3"), lit(0.95), lit(200)).alias("c3") - ] - ).sort(column("c1").sort(ascending=True)).collect() + result = ( + df_aggregate_100.aggregate( + [column("c1")], + [f.approx_percentile_cont(column("c3"), lit(0.95), lit(200)).alias("c3")], + ) + .sort(column("c1").sort(ascending=True)) + .collect() + ) assert len(result) == 1 result = result[0] assert result.column("c1") == pa.array(["a", "b", "c", "d", "e"]) assert result.column("c3") == pa.array([73, 68, 122, 124, 115]) + def test_bit_add_or_xor(df): df = df.aggregate( [], diff --git a/python/datafusion/tests/test_dataframe.py b/python/datafusion/tests/test_dataframe.py index 6444d9321..e5e0c9c8a 100644 --- a/python/datafusion/tests/test_dataframe.py +++ b/python/datafusion/tests/test_dataframe.py @@ -279,57 +279,67 @@ def test_distinct(): data_test_window_functions = [ - ("row", f.window("row_number", [], order_by=[f.order_by(column("c"))]), [2, 1, 3]), - ("rank", f.window("rank", [], order_by=[f.order_by(column("c"))]), [2, 1, 2]), - ("dense_rank", f.window("dense_rank", [], order_by=[f.order_by(column("c"))]), [2, 1, 2] ), - ("percent_rank", 
f.window("percent_rank", [], order_by=[f.order_by(column("c"))]), [0.5, 0, 0.5]), - ("cume_dist", f.window("cume_dist", [], order_by=[f.order_by(column("b"))]), [0.3333333333333333, 0.6666666666666666, 1.0]), - ("ntile", f.window("ntile", [literal(2)], order_by=[f.order_by(column("c"))]), [1, 1, 2]), - ("next", f.window("lead", [column("b")], order_by=[f.order_by(column("b"))]), [5, 6, None]), - ("previous", f.window("lag", [column("b")], order_by=[f.order_by(column("b"))]), [None, 4, 5]), - pytest.param( - "first_value", - f.window( + ("row", f.window("row_number", [], order_by=[f.order_by(column("c"))]), [2, 1, 3]), + ("rank", f.window("rank", [], order_by=[f.order_by(column("c"))]), [2, 1, 2]), + ( + "dense_rank", + f.window("dense_rank", [], order_by=[f.order_by(column("c"))]), + [2, 1, 2], + ), + ( + "percent_rank", + f.window("percent_rank", [], order_by=[f.order_by(column("c"))]), + [0.5, 0, 0.5], + ), + ( + "cume_dist", + f.window("cume_dist", [], order_by=[f.order_by(column("b"))]), + [0.3333333333333333, 0.6666666666666666, 1.0], + ), + ( + "ntile", + f.window("ntile", [literal(2)], order_by=[f.order_by(column("c"))]), + [1, 1, 2], + ), + ( + "next", + f.window("lead", [column("b")], order_by=[f.order_by(column("b"))]), + [5, 6, None], + ), + ( + "previous", + f.window("lag", [column("b")], order_by=[f.order_by(column("b"))]), + [None, 4, 5], + ), + pytest.param( "first_value", - [column("a")], - order_by=[f.order_by(column("b"))] + f.window("first_value", [column("a")], order_by=[f.order_by(column("b"))]), + [1, 1, 1], + ), + pytest.param( + "last_value", + f.window("last_value", [column("b")], order_by=[f.order_by(column("b"))]), + [4, 5, 6], ), - [1, 1, 1], - ), - pytest.param( - "last_value", - f.window("last_value", [column("b")], order_by=[f.order_by(column("b"))]), - [4, 5, 6], - ), - pytest.param( - "2nd_value", - f.window( - "nth_value", - [column("b"), literal(2)], - order_by=[f.order_by(column("b"))], + pytest.param( + "2nd_value", + f.window( 
+ "nth_value", + [column("b"), literal(2)], + order_by=[f.order_by(column("b"))], + ), + [None, 5, 5], ), - [None, 5, 5], - ), ] @pytest.mark.parametrize("name,expr,result", data_test_window_functions) def test_window_functions(df, name, expr, result): - df = df.select( - column("a"), - column("b"), - column("c"), - f.alias(expr, name) - ) + df = df.select(column("a"), column("b"), column("c"), f.alias(expr, name)) table = pa.Table.from_batches(df.collect()) - expected = { - "a": [1, 2, 3], - "b": [4, 5, 6], - "c": [8, 5, 8], - name: result - } + expected = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [8, 5, 8], name: result} assert table.sort_by("a").to_pydict() == expected diff --git a/python/datafusion/tests/test_expr.py b/python/datafusion/tests/test_expr.py index 1a41120a5..9071108cb 100644 --- a/python/datafusion/tests/test_expr.py +++ b/python/datafusion/tests/test_expr.py @@ -146,24 +146,26 @@ def test_expr_to_variant(): from datafusion import SessionContext from datafusion.expr import Filter - def traverse_logical_plan(plan): cur_node = plan.to_variant() if isinstance(cur_node, Filter): return cur_node.predicate().to_variant() - if hasattr(plan, 'inputs'): + if hasattr(plan, "inputs"): for input_plan in plan.inputs(): res = traverse_logical_plan(input_plan) if res is not None: return res ctx = SessionContext() - data = {'id': [1, 2, 3], 'name': ['Alice', 'Bob', 'Charlie']} - ctx.from_pydict(data, name='table1') + data = {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]} + ctx.from_pydict(data, name="table1") query = "SELECT * FROM table1 t1 WHERE t1.name IN ('dfa', 'ad', 'dfre', 'vsa')" logical_plan = ctx.sql(query).optimized_logical_plan() variant = traverse_logical_plan(logical_plan) assert variant is not None - assert variant.expr().to_variant().qualified_name() == 'table1.name' - assert str(variant.list()) == '[Expr(Utf8("dfa")), Expr(Utf8("ad")), Expr(Utf8("dfre")), Expr(Utf8("vsa"))]' + assert variant.expr().to_variant().qualified_name() == 
"table1.name" + assert ( + str(variant.list()) + == '[Expr(Utf8("dfa")), Expr(Utf8("ad")), Expr(Utf8("dfre")), Expr(Utf8("vsa"))]' + ) assert not variant.negated() diff --git a/python/datafusion/tests/test_functions.py b/python/datafusion/tests/test_functions.py index b8ad9c0d7..732136eaa 100644 --- a/python/datafusion/tests/test_functions.py +++ b/python/datafusion/tests/test_functions.py @@ -567,45 +567,86 @@ def test_array_function_obj_tests(stmt, py_expr): assert a == b -@pytest.mark.parametrize("function, expected_result", [ - (f.ascii(column("a")), pa.array([72, 87, 33], type=pa.int32())), # H = 72; W = 87; ! = 33 - (f.bit_length(column("a")), pa.array([40, 40, 8], type=pa.int32())), - (f.btrim(literal(" World ")), pa.array(["World", "World", "World"])), - (f.character_length(column("a")), pa.array([5, 5, 1], type=pa.int32())), - (f.chr(literal(68)), pa.array(["D", "D", "D"])), - (f.concat_ws("-", column("a"), literal("test")), pa.array(["Hello-test", "World-test", "!-test"])), - (f.concat(column("a"), literal("?")), pa.array(["Hello?", "World?", "!?"])), - (f.initcap(column("c")), pa.array(["Hello ", " World ", " !"])), - (f.left(column("a"), literal(3)), pa.array(["Hel", "Wor", "!"])), - (f.length(column("c")), pa.array([6, 7, 2], type=pa.int32())), - (f.lower(column("a")), pa.array(["hello", "world", "!"])), - (f.lpad(column("a"), literal(7)), pa.array([" Hello", " World", " !"])), - (f.ltrim(column("c")), pa.array(["hello ", "world ", "!"])), - (f.md5(column("a")), pa.array([ - "8b1a9953c4611296a827abf8c47804d7", - "f5a7924e621e84c9280a9a27e1bcb7f6", - "9033e0e305f247c0c3c80d0c7848c8b3", - ])), - (f.octet_length(column("a")), pa.array([5, 5, 1], type=pa.int32())), - (f.repeat(column("a"), literal(2)), pa.array(["HelloHello", "WorldWorld", "!!"])), - (f.replace(column("a"), literal("l"), literal("?")), pa.array(["He??o", "Wor?d", "!"])), - (f.reverse(column("a")), pa.array(["olleH", "dlroW", "!"])), - (f.right(column("a"), literal(4)), pa.array(["ello", 
"orld", "!"])), - (f.rpad(column("a"), literal(8)), pa.array(["Hello ", "World ", "! "])), - (f.rtrim(column("c")), pa.array(["hello", " world", " !"])), - (f.split_part(column("a"), literal("l"), literal(1)), pa.array(["He", "Wor", "!"])), - (f.starts_with(column("a"), literal("Wor")), pa.array([False, True, False])), - (f.strpos(column("a"), literal("o")), pa.array([5, 2, 0], type=pa.int32())), - (f.substr(column("a"), literal(3)), pa.array(["llo", "rld", ""])), - (f.translate(column("a"), literal("or"), literal("ld")), pa.array(["Helll", "Wldld", "!"])), - (f.trim(column("c")), pa.array(["hello", "world", "!"])), - (f.upper(column("c")), pa.array(["HELLO ", " WORLD ", " !"])), - (f.ends_with(column("a"), literal("llo")), pa.array([True, False, False])), - (f.overlay(column("a"), literal("--"), literal(2)), pa.array(["H--lo", "W--ld", "--"])), - (f.regexp_like(column("a"), literal("(ell|orl)")), pa.array([True, True, False])), - (f.regexp_match(column("a"), literal("(ell|orl)")), pa.array([["ell"], ["orl"], None])), - (f.regexp_replace(column("a"), literal("(ell|orl)"), literal("-")), pa.array(["H-o", "W-d", "!"])), -]) +@pytest.mark.parametrize( + "function, expected_result", + [ + ( + f.ascii(column("a")), + pa.array([72, 87, 33], type=pa.int32()), + ), # H = 72; W = 87; ! 
= 33 + (f.bit_length(column("a")), pa.array([40, 40, 8], type=pa.int32())), + (f.btrim(literal(" World ")), pa.array(["World", "World", "World"])), + (f.character_length(column("a")), pa.array([5, 5, 1], type=pa.int32())), + (f.chr(literal(68)), pa.array(["D", "D", "D"])), + ( + f.concat_ws("-", column("a"), literal("test")), + pa.array(["Hello-test", "World-test", "!-test"]), + ), + (f.concat(column("a"), literal("?")), pa.array(["Hello?", "World?", "!?"])), + (f.initcap(column("c")), pa.array(["Hello ", " World ", " !"])), + (f.left(column("a"), literal(3)), pa.array(["Hel", "Wor", "!"])), + (f.length(column("c")), pa.array([6, 7, 2], type=pa.int32())), + (f.lower(column("a")), pa.array(["hello", "world", "!"])), + (f.lpad(column("a"), literal(7)), pa.array([" Hello", " World", " !"])), + (f.ltrim(column("c")), pa.array(["hello ", "world ", "!"])), + ( + f.md5(column("a")), + pa.array( + [ + "8b1a9953c4611296a827abf8c47804d7", + "f5a7924e621e84c9280a9a27e1bcb7f6", + "9033e0e305f247c0c3c80d0c7848c8b3", + ] + ), + ), + (f.octet_length(column("a")), pa.array([5, 5, 1], type=pa.int32())), + ( + f.repeat(column("a"), literal(2)), + pa.array(["HelloHello", "WorldWorld", "!!"]), + ), + ( + f.replace(column("a"), literal("l"), literal("?")), + pa.array(["He??o", "Wor?d", "!"]), + ), + (f.reverse(column("a")), pa.array(["olleH", "dlroW", "!"])), + (f.right(column("a"), literal(4)), pa.array(["ello", "orld", "!"])), + ( + f.rpad(column("a"), literal(8)), + pa.array(["Hello ", "World ", "! 
"]), + ), + (f.rtrim(column("c")), pa.array(["hello", " world", " !"])), + ( + f.split_part(column("a"), literal("l"), literal(1)), + pa.array(["He", "Wor", "!"]), + ), + (f.starts_with(column("a"), literal("Wor")), pa.array([False, True, False])), + (f.strpos(column("a"), literal("o")), pa.array([5, 2, 0], type=pa.int32())), + (f.substr(column("a"), literal(3)), pa.array(["llo", "rld", ""])), + ( + f.translate(column("a"), literal("or"), literal("ld")), + pa.array(["Helll", "Wldld", "!"]), + ), + (f.trim(column("c")), pa.array(["hello", "world", "!"])), + (f.upper(column("c")), pa.array(["HELLO ", " WORLD ", " !"])), + (f.ends_with(column("a"), literal("llo")), pa.array([True, False, False])), + ( + f.overlay(column("a"), literal("--"), literal(2)), + pa.array(["H--lo", "W--ld", "--"]), + ), + ( + f.regexp_like(column("a"), literal("(ell|orl)")), + pa.array([True, True, False]), + ), + ( + f.regexp_match(column("a"), literal("(ell|orl)")), + pa.array([["ell"], ["orl"], None]), + ), + ( + f.regexp_replace(column("a"), literal("(ell|orl)"), literal("-")), + pa.array(["H-o", "W-d", "!"]), + ), + ], +) def test_string_functions(df, function, expected_result): df = df.select(function) result = df.collect() @@ -849,27 +890,30 @@ def test_regr_funcs_sql_2(): assert result_sql[0].column(8) == pa.array([4], type=pa.float64()) -@pytest.mark.parametrize("func, expected", [ - pytest.param(f.regr_slope, pa.array([2], type=pa.float64()), id="regr_slope"), - pytest.param(f.regr_intercept, pa.array([0], type=pa.float64()), id="regr_intercept"), - pytest.param(f.regr_count, pa.array([3], type=pa.uint64()), id="regr_count"), - pytest.param(f.regr_r2, pa.array([1], type=pa.float64()), id="regr_r2"), - pytest.param(f.regr_avgx, pa.array([2], type=pa.float64()), id="regr_avgx"), - pytest.param(f.regr_avgy, pa.array([4], type=pa.float64()), id="regr_avgy"), - pytest.param(f.regr_sxx, pa.array([2], type=pa.float64()), id="regr_sxx"), - pytest.param(f.regr_syy, pa.array([8], 
type=pa.float64()), id="regr_syy"), - pytest.param(f.regr_sxy, pa.array([4], type=pa.float64()), id="regr_sxy") -]) +@pytest.mark.parametrize( + "func, expected", + [ + pytest.param(f.regr_slope, pa.array([2], type=pa.float64()), id="regr_slope"), + pytest.param( + f.regr_intercept, pa.array([0], type=pa.float64()), id="regr_intercept" + ), + pytest.param(f.regr_count, pa.array([3], type=pa.uint64()), id="regr_count"), + pytest.param(f.regr_r2, pa.array([1], type=pa.float64()), id="regr_r2"), + pytest.param(f.regr_avgx, pa.array([2], type=pa.float64()), id="regr_avgx"), + pytest.param(f.regr_avgy, pa.array([4], type=pa.float64()), id="regr_avgy"), + pytest.param(f.regr_sxx, pa.array([2], type=pa.float64()), id="regr_sxx"), + pytest.param(f.regr_syy, pa.array([8], type=pa.float64()), id="regr_syy"), + pytest.param(f.regr_sxy, pa.array([4], type=pa.float64()), id="regr_sxy"), + ], +) def test_regr_funcs_df(func, expected): - # test case based on `regr_*() basic tests # https://github.com/apache/datafusion/blob/d1361d56b9a9e0c165d3d71a8df6795d2a5f51dd/datafusion/core/tests/sqllogictests/test_files/aggregate.slt#L2358C1-L2374C1 - ctx = SessionContext() # Create a DataFrame - data = {'column1': [1, 2, 3], 'column2': [2, 4, 6]} + data = {"column1": [1, 2, 3], "column2": [2, 4, 6]} df = ctx.from_pydict(data, name="test_table") # Perform the regression function using DataFrame API diff --git a/python/datafusion/tests/test_sql.py b/python/datafusion/tests/test_sql.py index 1505fb1e7..e41d01004 100644 --- a/python/datafusion/tests/test_sql.py +++ b/python/datafusion/tests/test_sql.py @@ -381,15 +381,31 @@ def test_udf( id="binary4", marks=pytest.mark.xfail, ), - pytest.param(helpers.data_datetime("s"), id="datetime_s", marks=pytest.mark.xfail), - pytest.param(helpers.data_datetime("ms"), id="datetime_ms", marks=pytest.mark.xfail), - pytest.param(helpers.data_datetime("us"), id="datetime_us", marks=pytest.mark.xfail), - pytest.param(helpers.data_datetime("ns"), 
id="datetime_ns", marks=pytest.mark.xfail), + pytest.param( + helpers.data_datetime("s"), id="datetime_s", marks=pytest.mark.xfail + ), + pytest.param( + helpers.data_datetime("ms"), id="datetime_ms", marks=pytest.mark.xfail + ), + pytest.param( + helpers.data_datetime("us"), id="datetime_us", marks=pytest.mark.xfail + ), + pytest.param( + helpers.data_datetime("ns"), id="datetime_ns", marks=pytest.mark.xfail + ), # Not writtable to parquet - pytest.param(helpers.data_timedelta("s"), id="timedelta_s", marks=pytest.mark.xfail), - pytest.param(helpers.data_timedelta("ms"), id="timedelta_ms", marks=pytest.mark.xfail), - pytest.param(helpers.data_timedelta("us"), id="timedelta_us", marks=pytest.mark.xfail), - pytest.param(helpers.data_timedelta("ns"), id="timedelta_ns", marks=pytest.mark.xfail), + pytest.param( + helpers.data_timedelta("s"), id="timedelta_s", marks=pytest.mark.xfail + ), + pytest.param( + helpers.data_timedelta("ms"), id="timedelta_ms", marks=pytest.mark.xfail + ), + pytest.param( + helpers.data_timedelta("us"), id="timedelta_us", marks=pytest.mark.xfail + ), + pytest.param( + helpers.data_timedelta("ns"), id="timedelta_ns", marks=pytest.mark.xfail + ), ], ) def test_simple_select(ctx, tmp_path, arr): From 69ed7fe41cc5524fcf2664af32aa78181e188e0b Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 30 Aug 2024 17:04:38 -0400 Subject: [PATCH 020/248] Add PyCapsule support for Arrow import and export (#825) --- docs/source/user-guide/io/arrow.rst | 73 ++++++++++++++ docs/source/user-guide/io/index.rst | 6 +- examples/import.py | 2 +- python/datafusion/context.py | 24 +++-- python/datafusion/dataframe.py | 16 +++ python/datafusion/tests/test_context.py | 37 ++++++- python/datafusion/tests/test_dataframe.py | 51 ++++++++-- src/context.rs | 46 ++++++--- src/dataframe.rs | 116 +++++++++++++++++++++- 9 files changed, 330 insertions(+), 41 deletions(-) create mode 100644 docs/source/user-guide/io/arrow.rst diff --git a/docs/source/user-guide/io/arrow.rst 
b/docs/source/user-guide/io/arrow.rst new file mode 100644 index 000000000..d571aa99c --- /dev/null +++ b/docs/source/user-guide/io/arrow.rst @@ -0,0 +1,73 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Arrow +===== + +DataFusion implements the +`Apache Arrow PyCapsule interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_ +for importing and exporting DataFrames with zero copy. With this feature, any Python +project that implements this interface can share data back and forth with DataFusion +with zero copy. + +We can demonstrate using `pyarrow <https://arrow.apache.org/docs/python/index.html>`_. + +Importing to DataFusion +----------------------- + +Here we will create an Arrow table and import it to DataFusion. + +To import an Arrow table, use :py:func:`datafusion.context.SessionContext.from_arrow`. +This will accept any Python object that implements +`__arrow_c_stream__ <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowstream-export>`_ +or `__arrow_c_array__ <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export>`_ +and returns a ``StructArray``. Common pyarrow sources you can use are: + +- `Array <https://arrow.apache.org/docs/python/generated/pyarrow.Array.html>`_ (but it must return a Struct Array) +- `Record Batch <https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatch.html>`_ +- `Record Batch Reader <https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatchReader.html>`_ +- `Table <https://arrow.apache.org/docs/python/generated/pyarrow.Table.html>`_ +.. 
ipython:: python + + from datafusion import SessionContext + import pyarrow as pa + + data = {"a": [1, 2, 3], "b": [4, 5, 6]} + table = pa.Table.from_pydict(data) + + ctx = SessionContext() + df = ctx.from_arrow(table) + df + +Exporting from DataFusion +------------------------- + +DataFusion DataFrames implement ``__arrow_c_stream__`` PyCapsule interface, so any +Python library that accepts these can import a DataFusion DataFrame directly. + +.. warning:: + It is important to note that this will cause the DataFrame execution to happen, which may be + a time consuming task. That is, you will cause a + :py:func:`datafusion.dataframe.DataFrame.collect` operation call to occur. + + +.. ipython:: python + + df = df.select((col("a") * lit(1.5)).alias("c"), lit("df").alias("d")) + pa.table(df) + diff --git a/docs/source/user-guide/io/index.rst b/docs/source/user-guide/io/index.rst index af08240ff..05411327e 100644 --- a/docs/source/user-guide/io/index.rst +++ b/docs/source/user-guide/io/index.rst @@ -21,8 +21,8 @@ IO .. toctree:: :maxdepth: 2 + arrow + avro csv - parquet json - avro - + parquet diff --git a/examples/import.py b/examples/import.py index a249a1c4e..cd965cb46 100644 --- a/examples/import.py +++ b/examples/import.py @@ -54,5 +54,5 @@ # Convert Arrow Table to datafusion DataFrame arrow_table = pa.Table.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]}) -df = ctx.from_arrow_table(arrow_table) +df = ctx.from_arrow(arrow_table) assert type(df) == datafusion.DataFrame diff --git a/python/datafusion/context.py b/python/datafusion/context.py index d4e50cfe2..283f71e1e 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -586,19 +586,31 @@ def from_pydict( """ return DataFrame(self.ctx.from_pydict(data, name)) - def from_arrow_table( - self, data: pyarrow.Table, name: str | None = None - ) -> DataFrame: - """Create a :py:class:`~datafusion.dataframe.DataFrame` from an Arrow table. 
+ def from_arrow(self, data: Any, name: str | None = None) -> DataFrame: + """Create a :py:class:`~datafusion.dataframe.DataFrame` from an Arrow source. + + The Arrow data source can be any object that implements either + ``__arrow_c_stream__`` or ``__arrow_c_array__``. For the latter, it must return + a struct array. Typical pyarrow sources: tables, record batches, batch readers. Args: - data: Arrow table. + data: Arrow data source. name: Name of the DataFrame. Returns: DataFrame representation of the Arrow table. """ - return DataFrame(self.ctx.from_arrow_table(data, name)) + return DataFrame(self.ctx.from_arrow(data, name)) + + @deprecated("Use ``from_arrow`` instead.") + def from_arrow_table( + self, data: pyarrow.Table, name: str | None = None + ) -> DataFrame: + """Create a :py:class:`~datafusion.dataframe.DataFrame` from an Arrow table. + + This is an alias for :py:func:`from_arrow`. + """ + return self.from_arrow(data, name) def from_pandas(self, data: pandas.DataFrame, name: str | None = None) -> DataFrame: """Create a :py:class:`~datafusion.dataframe.DataFrame` from a Pandas DataFrame. diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index fa7398442..4f1760135 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -524,3 +524,19 @@ def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFram """ columns = [c for c in columns] return DataFrame(self.df.unnest_columns(columns, preserve_nulls=preserve_nulls)) + + def __arrow_c_stream__(self, requested_schema: pa.Schema) -> Any: + """Export an Arrow PyCapsule Stream. + + This will execute and collect the DataFrame. We will attempt to respect the + requested schema, but only trivial transformations will be applied such as only + returning the fields listed in the requested schema if their data types match + those in the DataFrame. + + Args: + requested_schema: Attempt to provide the DataFrame using this schema. + + Returns: + Arrow PyCapsule object. 
+ """ + return self.df.__arrow_c_stream__(requested_schema) diff --git a/python/datafusion/tests/test_context.py b/python/datafusion/tests/test_context.py index 66d7e013a..0184280c2 100644 --- a/python/datafusion/tests/test_context.py +++ b/python/datafusion/tests/test_context.py @@ -156,7 +156,7 @@ def test_from_arrow_table(ctx): table = pa.Table.from_pydict(data) # convert to DataFrame - df = ctx.from_arrow_table(table) + df = ctx.from_arrow(table) tables = list(ctx.catalog().database().names()) assert df @@ -166,13 +166,42 @@ def test_from_arrow_table(ctx): assert df.collect()[0].num_rows == 3 +def record_batch_generator(num_batches: int): + schema = pa.schema([("a", pa.int64()), ("b", pa.int64())]) + for i in range(num_batches): + yield pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], schema=schema + ) + + +@pytest.mark.parametrize( + "source", + [ + # __arrow_c_array__ sources + pa.array([{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}]), + # __arrow_c_stream__ sources + pa.RecordBatch.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]}), + pa.RecordBatchReader.from_batches( + pa.schema([("a", pa.int64()), ("b", pa.int64())]), record_batch_generator(1) + ), + pa.Table.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]}), + ], +) +def test_from_arrow_sources(ctx, source) -> None: + df = ctx.from_arrow(source) + assert df + assert isinstance(df, DataFrame) + assert df.schema().names == ["a", "b"] + assert df.count() == 3 + + def test_from_arrow_table_with_name(ctx): # create a PyArrow table data = {"a": [1, 2, 3], "b": [4, 5, 6]} table = pa.Table.from_pydict(data) # convert to DataFrame with optional name - df = ctx.from_arrow_table(table, name="tbl") + df = ctx.from_arrow(table, name="tbl") tables = list(ctx.catalog().database().names()) assert df @@ -185,7 +214,7 @@ def test_from_arrow_table_empty(ctx): table = pa.Table.from_pydict(data, schema=schema) # convert to DataFrame - df = ctx.from_arrow_table(table) + df = ctx.from_arrow(table) tables 
= list(ctx.catalog().database().names()) assert df @@ -200,7 +229,7 @@ def test_from_arrow_table_empty_no_schema(ctx): table = pa.Table.from_pydict(data) # convert to DataFrame - df = ctx.from_arrow_table(table) + df = ctx.from_arrow(table) tables = list(ctx.catalog().database().names()) assert df diff --git a/python/datafusion/tests/test_dataframe.py b/python/datafusion/tests/test_dataframe.py index e5e0c9c8a..477bc0fce 100644 --- a/python/datafusion/tests/test_dataframe.py +++ b/python/datafusion/tests/test_dataframe.py @@ -47,7 +47,7 @@ def df(): names=["a", "b", "c"], ) - return ctx.create_dataframe([[batch]]) + return ctx.from_arrow(batch) @pytest.fixture @@ -835,13 +835,42 @@ def test_write_compressed_parquet_missing_compression_level(df, tmp_path, compre df.write_parquet(str(path), compression=compression) -# ctx = SessionContext() - -# # create a RecordBatch and a new DataFrame from it -# batch = pa.RecordBatch.from_arrays( -# [pa.array([1, 2, 3]), pa.array([4, 5, 6]), pa.array([8, 5, 8])], -# names=["a", "b", "c"], -# ) - -# df = ctx.create_dataframe([[batch]]) -# test_execute_stream(df) +def test_dataframe_export(df) -> None: + # Guarantees that we have the canonical implementation + # reading our dataframe export + table = pa.table(df) + assert table.num_columns == 3 + assert table.num_rows == 3 + + desired_schema = pa.schema([("a", pa.int64())]) + + # Verify we can request a schema + table = pa.table(df, schema=desired_schema) + assert table.num_columns == 1 + assert table.num_rows == 3 + + # Expect a table of nulls if the schema don't overlap + desired_schema = pa.schema([("g", pa.string())]) + table = pa.table(df, schema=desired_schema) + assert table.num_columns == 1 + assert table.num_rows == 3 + for i in range(0, 3): + assert table[0][i].as_py() is None + + # Expect an error when we cannot convert schema + desired_schema = pa.schema([("a", pa.float32())]) + failed_convert = False + try: + table = pa.table(df, schema=desired_schema) + except 
Exception: + failed_convert = True + assert failed_convert + + # Expect an error when we have a not set non-nullable + desired_schema = pa.schema([("g", pa.string(), False)]) + failed_convert = False + try: + table = pa.table(df, schema=desired_schema) + except Exception: + failed_convert = True + assert failed_convert diff --git a/src/context.rs b/src/context.rs index a43599cf7..4433d94c2 100644 --- a/src/context.rs +++ b/src/context.rs @@ -20,12 +20,15 @@ use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; +use arrow::array::RecordBatchReader; +use arrow::ffi_stream::ArrowArrayStreamReader; +use arrow::pyarrow::FromPyArrow; use datafusion::execution::session_state::SessionStateBuilder; use object_store::ObjectStore; use url::Url; use uuid::Uuid; -use pyo3::exceptions::{PyKeyError, PyValueError}; +use pyo3::exceptions::{PyKeyError, PyTypeError, PyValueError}; use pyo3::prelude::*; use crate::catalog::{PyCatalog, PyTable}; @@ -444,7 +447,7 @@ impl PySessionContext { let table = table_class.call_method1("from_pylist", args)?; // Convert Arrow Table to datafusion DataFrame - let df = self.from_arrow_table(table, name, py)?; + let df = self.from_arrow(table, name, py)?; Ok(df) } @@ -463,29 +466,42 @@ impl PySessionContext { let table = table_class.call_method1("from_pydict", args)?; // Convert Arrow Table to datafusion DataFrame - let df = self.from_arrow_table(table, name, py)?; + let df = self.from_arrow(table, name, py)?; Ok(df) } /// Construct datafusion dataframe from Arrow Table - pub fn from_arrow_table( + pub fn from_arrow( &mut self, data: Bound<'_, PyAny>, name: Option<&str>, py: Python, ) -> PyResult { - // Instantiate pyarrow Table object & convert to batches - let table = data.call_method0("to_batches")?; + let (schema, batches) = + if let Ok(stream_reader) = ArrowArrayStreamReader::from_pyarrow_bound(&data) { + // Works for any object that implements __arrow_c_stream__ in pycapsule. 
+ + let schema = stream_reader.schema().as_ref().to_owned(); + let batches = stream_reader + .collect::, arrow::error::ArrowError>>() + .map_err(DataFusionError::from)?; + + (schema, batches) + } else if let Ok(array) = RecordBatch::from_pyarrow_bound(&data) { + // While this says RecordBatch, it will work for any object that implements + // __arrow_c_array__ and returns a StructArray. + + (array.schema().as_ref().to_owned(), vec![array]) + } else { + return Err(PyTypeError::new_err( + "Expected either a Arrow Array or Arrow Stream in from_arrow().", + )); + }; - let schema = data.getattr("schema")?; - let schema = schema.extract::>()?; - - // Cast PyAny to RecordBatch type // Because create_dataframe() expects a vector of vectors of record batches // here we need to wrap the vector of record batches in an additional vector - let batches = table.extract::>>()?; - let list_of_batches = PyArrowType::from(vec![batches.0]); - self.create_dataframe(list_of_batches, name, Some(schema), py) + let list_of_batches = PyArrowType::from(vec![batches]); + self.create_dataframe(list_of_batches, name, Some(schema.into()), py) } /// Construct datafusion dataframe from pandas @@ -504,7 +520,7 @@ impl PySessionContext { let table = table_class.call_method1("from_pandas", args)?; // Convert Arrow Table to datafusion DataFrame - let df = self.from_arrow_table(table, name, py)?; + let df = self.from_arrow(table, name, py)?; Ok(df) } @@ -518,7 +534,7 @@ impl PySessionContext { let table = data.call_method0("to_arrow")?; // Convert Arrow Table to datafusion DataFrame - let df = self.from_arrow_table(table, name, data.py())?; + let df = self.from_arrow(table, name, data.py())?; Ok(df) } diff --git a/src/dataframe.rs b/src/dataframe.rs index 4db59d4fe..22b05226c 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -15,8 +15,14 @@ // specific language governing permissions and limitations // under the License. 
+use std::ffi::CString; use std::sync::Arc; +use arrow::array::{new_null_array, RecordBatch, RecordBatchIterator, RecordBatchReader}; +use arrow::compute::can_cast_types; +use arrow::error::ArrowError; +use arrow::ffi::FFI_ArrowSchema; +use arrow::ffi_stream::FFI_ArrowArrayStream; use datafusion::arrow::datatypes::Schema; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; use datafusion::arrow::util::pretty; @@ -29,7 +35,7 @@ use datafusion_common::UnnestOptions; use pyo3::exceptions::{PyTypeError, PyValueError}; use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; -use pyo3::types::PyTuple; +use pyo3::types::{PyCapsule, PyTuple}; use tokio::task::JoinHandle; use crate::errors::py_datafusion_err; @@ -451,6 +457,39 @@ impl PyDataFrame { Ok(table) } + fn __arrow_c_stream__<'py>( + &'py mut self, + py: Python<'py>, + requested_schema: Option>, + ) -> PyResult> { + let mut batches = wait_for_future(py, self.df.as_ref().clone().collect())?; + let mut schema: Schema = self.df.schema().to_owned().into(); + + if let Some(schema_capsule) = requested_schema { + validate_pycapsule(&schema_capsule, "arrow_schema")?; + + let schema_ptr = unsafe { schema_capsule.reference::() }; + let desired_schema = Schema::try_from(schema_ptr).map_err(DataFusionError::from)?; + + schema = project_schema(schema, desired_schema).map_err(DataFusionError::ArrowError)?; + + batches = batches + .into_iter() + .map(|record_batch| record_batch_into_schema(record_batch, &schema)) + .collect::, ArrowError>>() + .map_err(DataFusionError::ArrowError)?; + } + + let batches_wrapped = batches.into_iter().map(Ok); + + let reader = RecordBatchIterator::new(batches_wrapped, Arc::new(schema)); + let reader: Box = Box::new(reader); + + let ffi_stream = FFI_ArrowArrayStream::new(reader); + let stream_capsule_name = CString::new("arrow_array_stream").unwrap(); + PyCapsule::new_bound(py, ffi_stream, Some(stream_capsule_name)) + } + fn execute_stream(&self, py: Python) -> PyResult { // create a Tokio 
runtime to run the async code let rt = &get_tokio_runtime(py).0; @@ -539,3 +578,78 @@ fn print_dataframe(py: Python, df: DataFrame) -> PyResult<()> { print.call1((result,))?; Ok(()) } + +fn project_schema(from_schema: Schema, to_schema: Schema) -> Result { + let merged_schema = Schema::try_merge(vec![from_schema, to_schema.clone()])?; + + let project_indices: Vec = to_schema + .fields + .iter() + .map(|field| field.name()) + .filter_map(|field_name| merged_schema.index_of(field_name).ok()) + .collect(); + + merged_schema.project(&project_indices) +} + +fn record_batch_into_schema( + record_batch: RecordBatch, + schema: &Schema, +) -> Result { + let schema = Arc::new(schema.clone()); + let base_schema = record_batch.schema(); + if base_schema.fields().len() == 0 { + // Nothing to project + return Ok(RecordBatch::new_empty(schema)); + } + + let array_size = record_batch.column(0).len(); + let mut data_arrays = Vec::with_capacity(schema.fields().len()); + + for field in schema.fields() { + let desired_data_type = field.data_type(); + if let Some(original_data) = record_batch.column_by_name(field.name()) { + let original_data_type = original_data.data_type(); + + if can_cast_types(original_data_type, desired_data_type) { + data_arrays.push(arrow::compute::kernels::cast( + original_data, + desired_data_type, + )?); + } else if field.is_nullable() { + data_arrays.push(new_null_array(desired_data_type, array_size)); + } else { + return Err(ArrowError::CastError(format!("Attempting to cast to non-nullable and non-castable field {} during schema projection.", field.name()))); + } + } else { + if !field.is_nullable() { + return Err(ArrowError::CastError(format!( + "Attempting to set null to non-nullable field {} during schema projection.", + field.name() + ))); + } + data_arrays.push(new_null_array(desired_data_type, array_size)); + } + } + + RecordBatch::try_new(schema, data_arrays) +} + +fn validate_pycapsule(capsule: &Bound, name: &str) -> PyResult<()> { + let 
capsule_name = capsule.name()?; + if capsule_name.is_none() { + return Err(PyValueError::new_err( + "Expected schema PyCapsule to have name set.", + )); + } + + let capsule_name = capsule_name.unwrap().to_str()?; + if capsule_name != name { + return Err(PyValueError::new_err(format!( + "Expected name '{}' in PyCapsule, instead got '{}'", + name, capsule_name + ))); + } + + Ok(()) +} From 003eea8fe75bed9c12200ce296a8c7fa5212b534 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 30 Aug 2024 17:05:00 -0400 Subject: [PATCH 021/248] Feature/expose when function (#836) --- python/datafusion/functions.py | 11 +++++++++++ python/datafusion/tests/test_functions.py | 19 +++++++++++++++++++ src/functions.rs | 9 +++++++++ 3 files changed, 39 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 59a1974fd..ec0c1104d 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -245,6 +245,7 @@ "var", "var_pop", "var_samp", + "when", "window", ] @@ -364,6 +365,16 @@ def case(expr: Expr) -> CaseBuilder: return CaseBuilder(f.case(expr.expr)) +def when(when: Expr, then: Expr) -> CaseBuilder: + """Create a case expression that has no base expression. + + Create a :py:class:`~datafusion.expr.CaseBuilder` to match cases for the + expression ``expr``. See :py:class:`~datafusion.expr.CaseBuilder` for + detailed usage. 
+ """ + return CaseBuilder(f.when(when.expr, then.expr)) + + def window( name: str, args: list[Expr], diff --git a/python/datafusion/tests/test_functions.py b/python/datafusion/tests/test_functions.py index 732136eaa..e5429bd60 100644 --- a/python/datafusion/tests/test_functions.py +++ b/python/datafusion/tests/test_functions.py @@ -836,6 +836,25 @@ def test_case(df): assert result.column(2) == pa.array(["Hola", "Mundo", None]) +def test_when_with_no_base(df): + df.show() + df = df.select( + column("b"), + f.when(column("b") > literal(5), literal("too big")) + .when(column("b") < literal(5), literal("too small")) + .otherwise(literal("just right")) + .alias("goldilocks"), + f.when(column("a") == literal("Hello"), column("a")).end().alias("greeting"), + ) + df.show() + + result = df.collect() + result = result[0] + assert result.column(0) == pa.array([4, 5, 6]) + assert result.column(1) == pa.array(["too small", "just right", "too big"]) + assert result.column(2) == pa.array(["Hello", None, None]) + + def test_regr_funcs_sql(df): # test case base on # https://github.com/apache/arrow-datafusion/blob/d1361d56b9a9e0c165d3d71a8df6795d2a5f51dd/datafusion/core/tests/sqllogictests/test_files/aggregate.slt#L2330 diff --git a/src/functions.rs b/src/functions.rs index c53d4ad92..252563621 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -533,6 +533,14 @@ fn case(expr: PyExpr) -> PyResult { }) } +/// Create a CASE WHEN statement with literal WHEN expressions for comparison to the base expression. +#[pyfunction] +fn when(when: PyExpr, then: PyExpr) -> PyResult { + Ok(PyCaseBuilder { + case_builder: datafusion_expr::when(when.expr, then.expr), + }) +} + /// Helper function to find the appropriate window function. 
/// /// Search procedure: @@ -910,6 +918,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(char_length))?; m.add_wrapped(wrap_pyfunction!(coalesce))?; m.add_wrapped(wrap_pyfunction!(case))?; + m.add_wrapped(wrap_pyfunction!(when))?; m.add_wrapped(wrap_pyfunction!(col))?; m.add_wrapped(wrap_pyfunction!(concat_ws))?; m.add_wrapped(wrap_pyfunction!(concat))?; From 90f5b5b355d79b56ae3607b7b0cdeb09b67e5121 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 2 Sep 2024 10:51:00 -0400 Subject: [PATCH 022/248] Add Window Functions for use with function builder (#808) * Add window function as template for others and function builder * Adding docstrings * Change last_value to use function builder instead of explicitly passing values * Allow any value for lead function default value and add unit test * Add lead window function and unit tests * Temporarily commenting out deprecated functions in documenation so builder will pass * Expose row_number window function * Add rank window function * Add percent rank and dense rank * Add cume_dist * Add ntile window function * Add comment to update when upstream merges * Window frame required calling inner value * Add unit test for avg as window function * Working on documentation for window functions * Add pyo build config file to git ignore since this is user specific * Add examples to docstring * Optionally add window function parameters during function call * Update sort and order_by to apply automatic ordering if any other expression is given * Update unit tests to be cleaner and use default sort on expressions * Ignore vscode folder specific settings * Window frames should only apply to aggregate functions used as window functions. 
Also pass in scalar pyarrow values so we can set a range other than a uint * Remove deprecated warning until we actually have a way to use all functions without calling window() * Built in window functions do not have any impact by setting null_treatment so remove from user facing * Update user documentation on how to pass parameters for different window functions and what their impacts are * Make first_value and last_value identical in the interface --- .gitignore | 4 + .../common-operations/aggregations.rst | 2 + .../user-guide/common-operations/windows.rst | 187 +++++++-- python/datafusion/dataframe.py | 7 +- python/datafusion/expr.py | 113 ++++- python/datafusion/functions.py | 390 +++++++++++++++++- python/datafusion/tests/test_dataframe.py | 182 ++++++-- python/datafusion/tests/test_functions.py | 1 + src/dataframe.rs | 3 +- src/expr.rs | 110 ++++- src/expr/window.rs | 12 +- src/functions.rs | 176 ++++++-- 12 files changed, 1059 insertions(+), 128 deletions(-) diff --git a/.gitignore b/.gitignore index 0030b907b..aaeaaa5b1 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ target /docs/temp /docs/build .DS_Store +.vscode # Byte-compiled / optimized / DLL files __pycache__/ @@ -31,3 +32,6 @@ apache-rat-*.jar CHANGELOG.md.bak docs/mdbook/book + +.pyo3_build_config + diff --git a/docs/source/user-guide/common-operations/aggregations.rst b/docs/source/user-guide/common-operations/aggregations.rst index b9202129e..7ad402210 100644 --- a/docs/source/user-guide/common-operations/aggregations.rst +++ b/docs/source/user-guide/common-operations/aggregations.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. 
_aggregation: + Aggregation ============ diff --git a/docs/source/user-guide/common-operations/windows.rst b/docs/source/user-guide/common-operations/windows.rst index 5ef3c986c..609176897 100644 --- a/docs/source/user-guide/common-operations/windows.rst +++ b/docs/source/user-guide/common-operations/windows.rst @@ -15,13 +15,16 @@ .. specific language governing permissions and limitations .. under the License. +.. _window_functions: + Window Functions ================ -In this section you will learn about window functions. A window function utilizes values from one or multiple rows to -produce a result for each individual row, unlike an aggregate function that provides a single value for multiple rows. +In this section you will learn about window functions. A window function utilizes values from one or +multiple rows to produce a result for each individual row, unlike an aggregate function that +provides a single value for multiple rows. -The functionality of window functions in DataFusion is supported by the dedicated :py:func:`~datafusion.functions.window` function. +The window functions are availble in the :py:mod:`~datafusion.functions` module. We'll use the pokemon dataset (from Ritchie Vink) in the following examples. @@ -40,20 +43,25 @@ We'll use the pokemon dataset (from Ritchie Vink) in the following examples. ctx = SessionContext() df = ctx.read_csv("pokemon.csv") -Here is an example that shows how to compare each pokemons’s attack power with the average attack power in its ``"Type 1"`` +Here is an example that shows how you can compare each pokemon's speed to the speed of the +previous row in the DataFrame. .. 
ipython:: python df.select( col('"Name"'), - col('"Attack"'), - f.alias( - f.window("avg", [col('"Attack"')], partition_by=[col('"Type 1"')]), - "Average Attack", - ) + col('"Speed"'), + f.lag(col('"Speed"')).alias("Previous Speed") ) -You can also control the order in which rows are processed by window functions by providing +Setting Parameters +------------------ + + +Ordering +^^^^^^^^ + +You can control the order in which rows are processed by window functions by providing a list of ``order_by`` functions for the ``order_by`` parameter. .. ipython:: python @@ -61,33 +69,150 @@ a list of ``order_by`` functions for the ``order_by`` parameter. df.select( col('"Name"'), col('"Attack"'), - f.alias( - f.window( - "rank", - [], - partition_by=[col('"Type 1"')], - order_by=[f.order_by(col('"Attack"'))], - ), - "rank", - ), + col('"Type 1"'), + f.rank( + partition_by=[col('"Type 1"')], + order_by=[col('"Attack"').sort(ascending=True)], + ).alias("rank"), + ).sort(col('"Type 1"'), col('"Attack"')) + +Partitions +^^^^^^^^^^ + +A window function can take a list of ``partition_by`` columns similar to an +:ref:`Aggregation Function`. This will cause the window values to be evaluated +independently for each of the partitions. In the example above, we found the rank of each +Pokemon per ``Type 1`` partitions. We can see the first couple of each partition if we do +the following: + +.. ipython:: python + + df.select( + col('"Name"'), + col('"Attack"'), + col('"Type 1"'), + f.rank( + partition_by=[col('"Type 1"')], + order_by=[col('"Attack"').sort(ascending=True)], + ).alias("rank"), + ).filter(col("rank") < lit(3)).sort(col('"Type 1"'), col("rank")) + +Window Frame +^^^^^^^^^^^^ + +When using aggregate functions, the Window Frame of defines the rows over which it operates. +If you do not specify a Window Frame, the frame will be set depending on the following +criteria. 
+ +* If an ``order_by`` clause is set, the default window frame is defined as the rows between + unbounded preceeding and the current row. +* If an ``order_by`` is not set, the default frame is defined as the rows betwene unbounded + and unbounded following (the entire partition). + +Window Frames are defined by three parameters: unit type, starting bound, and ending bound. + +The unit types available are: + +* Rows: The starting and ending boundaries are defined by the number of rows relative to the + current row. +* Range: When using Range, the ``order_by`` clause must have exactly one term. The boundaries + are defined bow how close the rows are to the value of the expression in the ``order_by`` + parameter. +* Groups: A "group" is the set of all rows that have equivalent values for all terms in the + ``order_by`` clause. + +In this example we perform a "rolling average" of the speed of the current Pokemon and the +two preceeding rows. + +.. ipython:: python + + from datafusion.expr import WindowFrame + + df.select( + col('"Name"'), + col('"Speed"'), + f.window("avg", + [col('"Speed"')], + order_by=[col('"Speed"')], + window_frame=WindowFrame("rows", 2, 0) + ).alias("Previous Speed") + ) + +Null Treatment +^^^^^^^^^^^^^^ + +When using aggregate functions as window functions, it is often useful to specify how null values +should be treated. In order to do this you need to use the builder function. In future releases +we expect this to be simplified in the interface. + +One common usage for handling nulls is the case where you want to find the last value up to the +current row. In the following example we demonstrate how setting the null treatment to ignore +nulls will fill in with the value of the most recent non-null row. To do this, we also will set +the window frame so that we only process up to the current row. + +In this example, we filter down to one specific type of Pokemon that does have some entries in +it's ``Type 2`` column that are null. + +.. 
ipython:: python + + from datafusion.common import NullTreatment + + df.filter(col('"Type 1"') == lit("Bug")).select( + '"Name"', + '"Type 2"', + f.window("last_value", [col('"Type 2"')]) + .window_frame(WindowFrame("rows", None, 0)) + .order_by(col('"Speed"')) + .null_treatment(NullTreatment.IGNORE_NULLS) + .build() + .alias("last_wo_null"), + f.window("last_value", [col('"Type 2"')]) + .window_frame(WindowFrame("rows", None, 0)) + .order_by(col('"Speed"')) + .null_treatment(NullTreatment.RESPECT_NULLS) + .build() + .alias("last_with_null") + ) + +Aggregate Functions +------------------- + +You can use any :ref:`Aggregation Function` as a window function. Currently +aggregate functions must use the deprecated +:py:func:`datafusion.functions.window` API but this should be resolved in +DataFusion 42.0 (`Issue Link `_). Here +is an example that shows how to compare each pokemons’s attack power with the average attack +power in its ``"Type 1"`` using the :py:func:`datafusion.functions.avg` function. + +.. ipython:: python + :okwarning: + + df.select( + col('"Name"'), + col('"Attack"'), + col('"Type 1"'), + f.window("avg", [col('"Attack"')]) + .partition_by(col('"Type 1"')) + .build() + .alias("Average Attack"), ) +Available Functions +------------------- + The possible window functions are: 1. Rank Functions - - rank - - dense_rank - - row_number - - ntile + - :py:func:`datafusion.functions.rank` + - :py:func:`datafusion.functions.dense_rank` + - :py:func:`datafusion.functions.ntile` + - :py:func:`datafusion.functions.row_number` 2. Analytical Functions - - cume_dist - - percent_rank - - lag - - lead - - first_value - - last_value - - nth_value + - :py:func:`datafusion.functions.cume_dist` + - :py:func:`datafusion.functions.percent_rank` + - :py:func:`datafusion.functions.lag` + - :py:func:`datafusion.functions.lead` 3. Aggregate Functions - - All aggregate functions can be used as window functions. + - All :ref:`Aggregation Functions` can be used as window functions. 
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 4f1760135..0e7d82e29 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -123,11 +123,10 @@ def select(self, *exprs: Expr | str) -> DataFrame: df = df.select("a", col("b"), col("a").alias("alternate_a")) """ - exprs = [ - arg.expr if isinstance(arg, Expr) else Expr.column(arg).expr - for arg in exprs + exprs_internal = [ + Expr.column(arg).expr if isinstance(arg, str) else arg.expr for arg in exprs ] - return DataFrame(self.df.select(*exprs)) + return DataFrame(self.df.select(*exprs_internal)) def filter(self, *predicates: Expr) -> DataFrame: """Return a DataFrame for which ``predicate`` evaluates to ``True``. diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 71fcf397b..c7272bb3b 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -23,8 +23,8 @@ from __future__ import annotations from ._internal import expr as expr_internal, LogicalPlan -from datafusion.common import RexType, DataTypeMap -from typing import Any +from datafusion.common import NullTreatment, RexType, DataTypeMap +from typing import Any, Optional import pyarrow as pa # The following are imported from the internal representation. We may choose to @@ -355,6 +355,10 @@ def is_null(self) -> Expr: """Returns ``True`` if this expression is null.""" return Expr(self.expr.is_null()) + def is_not_null(self) -> Expr: + """Returns ``True`` if this expression is not null.""" + return Expr(self.expr.is_not_null()) + def cast(self, to: pa.DataType[Any]) -> Expr: """Cast to a new data type.""" return Expr(self.expr.cast(to)) @@ -405,12 +409,107 @@ def column_name(self, plan: LogicalPlan) -> str: """Compute the output column name based on the provided logical plan.""" return self.expr.column_name(plan) + def order_by(self, *exprs: Expr) -> ExprFuncBuilder: + """Set the ordering for a window or aggregate function. 
+ + This function will create an :py:class:`ExprFuncBuilder` that can be used to + set parameters for either window or aggregate functions. If used on any other + type of expression, an error will be generated when ``build()`` is called. + """ + return ExprFuncBuilder(self.expr.order_by(list(e.expr for e in exprs))) + + def filter(self, filter: Expr) -> ExprFuncBuilder: + """Filter an aggregate function. + + This function will create an :py:class:`ExprFuncBuilder` that can be used to + set parameters for either window or aggregate functions. If used on any other + type of expression, an error will be generated when ``build()`` is called. + """ + return ExprFuncBuilder(self.expr.filter(filter.expr)) + + def distinct(self) -> ExprFuncBuilder: + """Only evaluate distinct values for an aggregate function. + + This function will create an :py:class:`ExprFuncBuilder` that can be used to + set parameters for either window or aggregate functions. If used on any other + type of expression, an error will be generated when ``build()`` is called. + """ + return ExprFuncBuilder(self.expr.distinct()) + + def null_treatment(self, null_treatment: NullTreatment) -> ExprFuncBuilder: + """Set the treatment for ``null`` values for a window or aggregate function. + + This function will create an :py:class:`ExprFuncBuilder` that can be used to + set parameters for either window or aggregate functions. If used on any other + type of expression, an error will be generated when ``build()`` is called. + """ + return ExprFuncBuilder(self.expr.null_treatment(null_treatment)) + + def partition_by(self, *partition_by: Expr) -> ExprFuncBuilder: + """Set the partitioning for a window function. + + This function will create an :py:class:`ExprFuncBuilder` that can be used to + set parameters for either window or aggregate functions. If used on any other + type of expression, an error will be generated when ``build()`` is called. 
+ """ + return ExprFuncBuilder( + self.expr.partition_by(list(e.expr for e in partition_by)) + ) + + def window_frame(self, window_frame: WindowFrame) -> ExprFuncBuilder: + """Set the frame fora window function. + + This function will create an :py:class:`ExprFuncBuilder` that can be used to + set parameters for either window or aggregate functions. If used on any other + type of expression, an error will be generated when ``build()`` is called. + """ + return ExprFuncBuilder(self.expr.window_frame(window_frame.window_frame)) + + +class ExprFuncBuilder: + def __init__(self, builder: expr_internal.ExprFuncBuilder): + self.builder = builder + + def order_by(self, *exprs: Expr) -> ExprFuncBuilder: + """Set the ordering for a window or aggregate function. + + Values given in ``exprs`` must be sort expressions. You can convert any other + expression to a sort expression using `.sort()`. + """ + return ExprFuncBuilder(self.builder.order_by(list(e.expr for e in exprs))) + + def filter(self, filter: Expr) -> ExprFuncBuilder: + """Filter values during aggregation.""" + return ExprFuncBuilder(self.builder.filter(filter.expr)) + + def distinct(self) -> ExprFuncBuilder: + """Only evaluate distinct values during aggregation.""" + return ExprFuncBuilder(self.builder.distinct()) + + def null_treatment(self, null_treatment: NullTreatment) -> ExprFuncBuilder: + """Set how nulls are treated for either window or aggregate functions.""" + return ExprFuncBuilder(self.builder.null_treatment(null_treatment)) + + def partition_by(self, *partition_by: Expr) -> ExprFuncBuilder: + """Set partitioning for window functions.""" + return ExprFuncBuilder( + self.builder.partition_by(list(e.expr for e in partition_by)) + ) + + def window_frame(self, window_frame: WindowFrame) -> ExprFuncBuilder: + """Set window frame for window functions.""" + return ExprFuncBuilder(self.builder.window_frame(window_frame.window_frame)) + + def build(self) -> Expr: + """Create an expression from a Function 
Builder.""" + return Expr(self.builder.build()) + class WindowFrame: """Defines a window frame for performing window operations.""" def __init__( - self, units: str, start_bound: int | None, end_bound: int | None + self, units: str, start_bound: Optional[Any], end_bound: Optional[Any] ) -> None: """Construct a window frame using the given parameters. @@ -423,6 +522,14 @@ def __init__( will be set to unbounded. If unit type is ``groups``, this parameter must be set. """ + if not isinstance(start_bound, pa.Scalar) and start_bound is not None: + start_bound = pa.scalar(start_bound) + if units == "rows" or units == "groups": + start_bound = start_bound.cast(pa.uint64()) + if not isinstance(end_bound, pa.Scalar) and end_bound is not None: + end_bound = pa.scalar(end_bound) + if units == "rows" or units == "groups": + end_bound = end_bound.cast(pa.uint64()) self.window_frame = expr_internal.WindowFrame(units, start_bound, end_bound) def get_frame_units(self) -> str: diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index ec0c1104d..28201c1d1 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -27,6 +27,10 @@ from datafusion.expr import CaseBuilder, Expr, WindowFrame from datafusion.context import SessionContext +from typing import Any, Optional + +import pyarrow as pa + __all__ = [ "abs", "acos", @@ -246,7 +250,16 @@ "var_pop", "var_samp", "when", + # Window Functions "window", + "lead", + "lag", + "row_number", + "rank", + "dense_rank", + "percent_rank", + "cume_dist", + "ntile", ] @@ -383,7 +396,14 @@ def window( window_frame: WindowFrame | None = None, ctx: SessionContext | None = None, ) -> Expr: - """Creates a new Window function expression.""" + """Creates a new Window function expression. + + This interface will soon be deprecated. Instead of using this interface, + users should call the window functions directly. 
For example, to perform a + lag use:: + + df.select(functions.lag(col("a")).partition_by(col("b")).build()) + """ args = [a.expr for a in args] partition_by = [e.expr for e in partition_by] if partition_by is not None else None order_by = [o.expr for o in order_by] if order_by is not None else None @@ -1022,12 +1042,12 @@ def struct(*args: Expr) -> Expr: return Expr(f.struct(*args)) -def named_struct(name_pairs: list[(str, Expr)]) -> Expr: +def named_struct(name_pairs: list[tuple[str, Expr]]) -> Expr: """Returns a struct with the given names and arguments pairs.""" - name_pairs = [[Expr.literal(pair[0]), pair[1]] for pair in name_pairs] + name_pair_exprs = [[Expr.literal(pair[0]), pair[1]] for pair in name_pairs] # flatten - name_pairs = [x.expr for xs in name_pairs for x in xs] + name_pairs = [x.expr for xs in name_pair_exprs for x in xs] return Expr(f.named_struct(*name_pairs)) @@ -1690,17 +1710,19 @@ def regr_syy(y: Expr, x: Expr, distinct: bool = False) -> Expr: def first_value( arg: Expr, distinct: bool = False, - filter: bool = None, - order_by: Expr | None = None, - null_treatment: common.NullTreatment | None = None, + filter: Optional[bool] = None, + order_by: Optional[list[Expr]] = None, + null_treatment: Optional[common.NullTreatment] = None, ) -> Expr: """Returns the first value in a group of values.""" + order_by_cols = [e.expr for e in order_by] if order_by is not None else None + return Expr( f.first_value( arg.expr, distinct=distinct, filter=filter, - order_by=order_by, + order_by=order_by_cols, null_treatment=null_treatment, ) ) @@ -1709,17 +1731,23 @@ def first_value( def last_value( arg: Expr, distinct: bool = False, - filter: bool = None, - order_by: Expr | None = None, - null_treatment: common.NullTreatment | None = None, + filter: Optional[bool] = None, + order_by: Optional[list[Expr]] = None, + null_treatment: Optional[common.NullTreatment] = None, ) -> Expr: - """Returns the last value in a group of values.""" + """Returns the last value in a 
group of values. + + To set parameters on this expression, use ``.order_by()``, ``.distinct()``, + ``.filter()``, or ``.null_treatment()``. + """ + order_by_cols = [e.expr for e in order_by] if order_by is not None else None + return Expr( f.last_value( arg.expr, distinct=distinct, filter=filter, - order_by=order_by, + order_by=order_by_cols, null_treatment=null_treatment, ) ) @@ -1748,3 +1776,339 @@ def bool_and(arg: Expr, distinct: bool = False) -> Expr: def bool_or(arg: Expr, distinct: bool = False) -> Expr: """Computes the boolean OR of the arguement.""" return Expr(f.bool_or(arg.expr, distinct=distinct)) + + +def lead( + arg: Expr, + shift_offset: int = 1, + default_value: Optional[Any] = None, + partition_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr]] = None, +) -> Expr: + """Create a lead window function. + + Lead operation will return the argument that is in the next shift_offset-th row in + the partition. For example ``lead(col("b"), shift_offset=3, default_value=5)`` will + return the 3rd following value in column ``b``. At the end of the partition, where + no futher values can be returned it will return the default value of 5. + + Here is an example of both the ``lead`` and :py:func:`datafusion.functions.lag` + functions on a simple DataFrame:: + + +--------+------+-----+ + | points | lead | lag | + +--------+------+-----+ + | 100 | 100 | | + | 100 | 50 | 100 | + | 50 | 25 | 100 | + | 25 | | 50 | + +--------+------+-----+ + + To set window function parameters use the window builder approach described in the + ref:`_window_functions` online documentation. + + Args: + arg: Value to return + shift_offset: Number of rows following the current row. + default_value: Value to return if shift_offet row does not exist. + partition_by: Expressions to partition the window frame on. + order_by: Set ordering within the window frame. 
+ """ + if not isinstance(default_value, pa.Scalar) and default_value is not None: + default_value = pa.scalar(default_value) + + partition_cols = ( + [col.expr for col in partition_by] if partition_by is not None else None + ) + order_cols = [col.expr for col in order_by] if order_by is not None else None + + return Expr( + f.lead( + arg.expr, + shift_offset, + default_value, + partition_by=partition_cols, + order_by=order_cols, + ) + ) + + +def lag( + arg: Expr, + shift_offset: int = 1, + default_value: Optional[Any] = None, + partition_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr]] = None, +) -> Expr: + """Create a lag window function. + + Lag operation will return the argument that is in the previous shift_offset-th row + in the partition. For example ``lag(col("b"), shift_offset=3, default_value=5)`` + will return the 3rd previous value in column ``b``. At the beginnig of the + partition, where no values can be returned it will return the default value of 5. + + Here is an example of both the ``lag`` and :py:func:`datafusion.functions.lead` + functions on a simple DataFrame:: + + +--------+------+-----+ + | points | lead | lag | + +--------+------+-----+ + | 100 | 100 | | + | 100 | 50 | 100 | + | 50 | 25 | 100 | + | 25 | | 50 | + +--------+------+-----+ + + Args: + arg: Value to return + shift_offset: Number of rows before the current row. + default_value: Value to return if shift_offet row does not exist. + partition_by: Expressions to partition the window frame on. + order_by: Set ordering within the window frame. 
+ """ + if not isinstance(default_value, pa.Scalar): + default_value = pa.scalar(default_value) + + partition_cols = ( + [col.expr for col in partition_by] if partition_by is not None else None + ) + order_cols = [col.expr for col in order_by] if order_by is not None else None + + return Expr( + f.lag( + arg.expr, + shift_offset, + default_value, + partition_by=partition_cols, + order_by=order_cols, + ) + ) + + +def row_number( + partition_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr]] = None, +) -> Expr: + """Create a row number window function. + + Returns the row number of the window function. + + Here is an example of the ``row_number`` on a simple DataFrame:: + + +--------+------------+ + | points | row number | + +--------+------------+ + | 100 | 1 | + | 100 | 2 | + | 50 | 3 | + | 25 | 4 | + +--------+------------+ + + Args: + partition_by: Expressions to partition the window frame on. + order_by: Set ordering within the window frame. + """ + partition_cols = ( + [col.expr for col in partition_by] if partition_by is not None else None + ) + order_cols = [col.expr for col in order_by] if order_by is not None else None + + return Expr( + f.row_number( + partition_by=partition_cols, + order_by=order_cols, + ) + ) + + +def rank( + partition_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr]] = None, +) -> Expr: + """Create a rank window function. + + Returns the rank based upon the window order. Consecutive equal values will receive + the same rank, but the next different value will not be consecutive but rather the + number of rows that preceed it plus one. This is similar to Olympic medals. If two + people tie for gold, the next place is bronze. There would be no silver medal. Here + is an example of a dataframe with a window ordered by descending ``points`` and the + associated rank. 
+ + You should set ``order_by`` to produce meaningful results:: + + +--------+------+ + | points | rank | + +--------+------+ + | 100 | 1 | + | 100 | 1 | + | 50 | 3 | + | 25 | 4 | + +--------+------+ + + Args: + partition_by: Expressions to partition the window frame on. + order_by: Set ordering within the window frame. + """ + partition_cols = ( + [col.expr for col in partition_by] if partition_by is not None else None + ) + order_cols = [col.expr for col in order_by] if order_by is not None else None + + return Expr( + f.rank( + partition_by=partition_cols, + order_by=order_cols, + ) + ) + + +def dense_rank( + partition_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr]] = None, +) -> Expr: + """Create a dense_rank window function. + + This window function is similar to :py:func:`rank` except that the returned values + will be consecutive. Here is an example of a dataframe with a window ordered by + descending ``points`` and the associated dense rank:: + + +--------+------------+ + | points | dense_rank | + +--------+------------+ + | 100 | 1 | + | 100 | 1 | + | 50 | 2 | + | 25 | 3 | + +--------+------------+ + + Args: + partition_by: Expressions to partition the window frame on. + order_by: Set ordering within the window frame. + """ + partition_cols = ( + [col.expr for col in partition_by] if partition_by is not None else None + ) + order_cols = [col.expr for col in order_by] if order_by is not None else None + + return Expr( + f.dense_rank( + partition_by=partition_cols, + order_by=order_cols, + ) + ) + + +def percent_rank( + partition_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr]] = None, +) -> Expr: + """Create a percent_rank window function. + + This window function is similar to :py:func:`rank` except that the returned values + are the percentage from 0.0 to 1.0 from first to last. 
Here is an example of a + dataframe with a window ordered by descending ``points`` and the associated percent + rank:: + + +--------+--------------+ + | points | percent_rank | + +--------+--------------+ + | 100 | 0.0 | + | 100 | 0.0 | + | 50 | 0.666667 | + | 25 | 1.0 | + +--------+--------------+ + + Args: + partition_by: Expressions to partition the window frame on. + order_by: Set ordering within the window frame. + """ + partition_cols = ( + [col.expr for col in partition_by] if partition_by is not None else None + ) + order_cols = [col.expr for col in order_by] if order_by is not None else None + + return Expr( + f.percent_rank( + partition_by=partition_cols, + order_by=order_cols, + ) + ) + + +def cume_dist( + partition_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr]] = None, +) -> Expr: + """Create a cumulative distribution window function. + + This window function is similar to :py:func:`rank` except that the returned values + are the ratio of the row number to the total numebr of rows. Here is an example of a + dataframe with a window ordered by descending ``points`` and the associated + cumulative distribution:: + + +--------+-----------+ + | points | cume_dist | + +--------+-----------+ + | 100 | 0.5 | + | 100 | 0.5 | + | 50 | 0.75 | + | 25 | 1.0 | + +--------+-----------+ + + Args: + partition_by: Expressions to partition the window frame on. + order_by: Set ordering within the window frame. + """ + partition_cols = ( + [col.expr for col in partition_by] if partition_by is not None else None + ) + order_cols = [col.expr for col in order_by] if order_by is not None else None + + return Expr( + f.cume_dist( + partition_by=partition_cols, + order_by=order_cols, + ) + ) + + +def ntile( + groups: int, + partition_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr]] = None, +) -> Expr: + """Create a n-tile window function. 
+ + This window function orders the window frame into a give number of groups based on + the ordering criteria. It then returns which group the current row is assigned to. + Here is an example of a dataframe with a window ordered by descending ``points`` + and the associated n-tile function:: + + +--------+-------+ + | points | ntile | + +--------+-------+ + | 120 | 1 | + | 100 | 1 | + | 80 | 2 | + | 60 | 2 | + | 40 | 3 | + | 20 | 3 | + +--------+-------+ + + Args: + groups: Number of groups for the n-tile to be divided into. + partition_by: Expressions to partition the window frame on. + order_by: Set ordering within the window frame. + """ + partition_cols = ( + [col.expr for col in partition_by] if partition_by is not None else None + ) + order_cols = [col.expr for col in order_by] if order_by is not None else None + + return Expr( + f.ntile( + Expr.literal(groups).expr, + partition_by=partition_cols, + order_by=order_cols, + ) + ) diff --git a/python/datafusion/tests/test_dataframe.py b/python/datafusion/tests/test_dataframe.py index 477bc0fce..c2a5f22ba 100644 --- a/python/datafusion/tests/test_dataframe.py +++ b/python/datafusion/tests/test_dataframe.py @@ -84,6 +84,23 @@ def aggregate_df(): return ctx.sql("select c1, sum(c2) from test group by c1") +@pytest.fixture +def partitioned_df(): + ctx = SessionContext() + + # create a RecordBatch and a new DataFrame from it + batch = pa.RecordBatch.from_arrays( + [ + pa.array([0, 1, 2, 3, 4, 5, 6]), + pa.array([7, None, 7, 8, 9, None, 9]), + pa.array(["A", "A", "A", "A", "B", "B", "B"]), + ], + names=["a", "b", "c"], + ) + + return ctx.create_dataframe([[batch]]) + + def test_select(df): df = df.select( column("a") + column("b"), @@ -249,7 +266,7 @@ def test_join(): df = df.join(df1, join_keys=(["a"], ["a"]), how="inner") df.show() - df = df.sort(column("l.a").sort(ascending=True)) + df = df.sort(column("l.a")) table = pa.Table.from_batches(df.collect()) expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]} @@ -263,83 
+280,162 @@ def test_distinct(): [pa.array([1, 2, 3, 1, 2, 3]), pa.array([4, 5, 6, 4, 5, 6])], names=["a", "b"], ) - df_a = ( - ctx.create_dataframe([[batch]]) - .distinct() - .sort(column("a").sort(ascending=True)) - ) + df_a = ctx.create_dataframe([[batch]]).distinct().sort(column("a")) batch = pa.RecordBatch.from_arrays( [pa.array([1, 2, 3]), pa.array([4, 5, 6])], names=["a", "b"], ) - df_b = ctx.create_dataframe([[batch]]).sort(column("a").sort(ascending=True)) + df_b = ctx.create_dataframe([[batch]]).sort(column("a")) assert df_a.collect() == df_b.collect() data_test_window_functions = [ - ("row", f.window("row_number", [], order_by=[f.order_by(column("c"))]), [2, 1, 3]), - ("rank", f.window("rank", [], order_by=[f.order_by(column("c"))]), [2, 1, 2]), + ( + "row", + f.row_number(order_by=[column("b"), column("a").sort(ascending=False)]), + [4, 2, 3, 5, 7, 1, 6], + ), + ( + "row_w_params", + f.row_number( + order_by=[column("b"), column("a")], + partition_by=[column("c")], + ), + [2, 1, 3, 4, 2, 1, 3], + ), + ("rank", f.rank(order_by=[column("b")]), [3, 1, 3, 5, 6, 1, 6]), + ( + "rank_w_params", + f.rank(order_by=[column("b"), column("a")], partition_by=[column("c")]), + [2, 1, 3, 4, 2, 1, 3], + ), ( "dense_rank", - f.window("dense_rank", [], order_by=[f.order_by(column("c"))]), - [2, 1, 2], + f.dense_rank(order_by=[column("b")]), + [2, 1, 2, 3, 4, 1, 4], + ), + ( + "dense_rank_w_params", + f.dense_rank(order_by=[column("b"), column("a")], partition_by=[column("c")]), + [2, 1, 3, 4, 2, 1, 3], ), ( "percent_rank", - f.window("percent_rank", [], order_by=[f.order_by(column("c"))]), - [0.5, 0, 0.5], + f.round(f.percent_rank(order_by=[column("b")]), literal(3)), + [0.333, 0.0, 0.333, 0.667, 0.833, 0.0, 0.833], + ), + ( + "percent_rank_w_params", + f.round( + f.percent_rank( + order_by=[column("b"), column("a")], partition_by=[column("c")] + ), + literal(3), + ), + [0.333, 0.0, 0.667, 1.0, 0.5, 0.0, 1.0], ), ( "cume_dist", - f.window("cume_dist", [], 
order_by=[f.order_by(column("b"))]), - [0.3333333333333333, 0.6666666666666666, 1.0], + f.round(f.cume_dist(order_by=[column("b")]), literal(3)), + [0.571, 0.286, 0.571, 0.714, 1.0, 0.286, 1.0], + ), + ( + "cume_dist_w_params", + f.round( + f.cume_dist( + order_by=[column("b"), column("a")], partition_by=[column("c")] + ), + literal(3), + ), + [0.5, 0.25, 0.75, 1.0, 0.667, 0.333, 1.0], ), ( "ntile", - f.window("ntile", [literal(2)], order_by=[f.order_by(column("c"))]), - [1, 1, 2], + f.ntile(2, order_by=[column("b")]), + [1, 1, 1, 2, 2, 1, 2], ), ( - "next", - f.window("lead", [column("b")], order_by=[f.order_by(column("b"))]), - [5, 6, None], + "ntile_w_params", + f.ntile(2, order_by=[column("b"), column("a")], partition_by=[column("c")]), + [1, 1, 2, 2, 1, 1, 2], ), + ("lead", f.lead(column("b"), order_by=[column("b")]), [7, None, 8, 9, 9, 7, None]), ( - "previous", - f.window("lag", [column("b")], order_by=[f.order_by(column("b"))]), - [None, 4, 5], + "lead_w_params", + f.lead( + column("b"), + shift_offset=2, + default_value=-1, + order_by=[column("b"), column("a")], + partition_by=[column("c")], + ), + [8, 7, -1, -1, -1, 9, -1], ), + ("lag", f.lag(column("b"), order_by=[column("b")]), [None, None, 7, 7, 8, None, 9]), + ( + "lag_w_params", + f.lag( + column("b"), + shift_offset=2, + default_value=-1, + order_by=[column("b"), column("a")], + partition_by=[column("c")], + ), + [-1, -1, None, 7, -1, -1, None], + ), + # TODO update all aggregate functions as windows once upstream merges https://github.com/apache/datafusion-python/issues/833 pytest.param( "first_value", - f.window("first_value", [column("a")], order_by=[f.order_by(column("b"))]), - [1, 1, 1], + f.window( + "first_value", + [column("a")], + order_by=[f.order_by(column("b"))], + partition_by=[column("c")], + ), + [1, 1, 1, 1, 5, 5, 5], ), pytest.param( "last_value", - f.window("last_value", [column("b")], order_by=[f.order_by(column("b"))]), - [4, 5, 6], + f.window("last_value", [column("a")]) + 
.window_frame(WindowFrame("rows", 0, None)) + .order_by(column("b")) + .partition_by(column("c")) + .build(), + [3, 3, 3, 3, 6, 6, 6], ), pytest.param( - "2nd_value", + "3rd_value", f.window( "nth_value", - [column("b"), literal(2)], - order_by=[f.order_by(column("b"))], + [column("b"), literal(3)], + order_by=[f.order_by(column("a"))], ), - [None, 5, 5], + [None, None, 7, 7, 7, 7, 7], + ), + pytest.param( + "avg", + f.round(f.window("avg", [column("b")], order_by=[column("a")]), literal(3)), + [7.0, 7.0, 7.0, 7.333, 7.75, 7.75, 8.0], ), ] @pytest.mark.parametrize("name,expr,result", data_test_window_functions) -def test_window_functions(df, name, expr, result): - df = df.select(column("a"), column("b"), column("c"), f.alias(expr, name)) - +def test_window_functions(partitioned_df, name, expr, result): + df = partitioned_df.select( + column("a"), column("b"), column("c"), f.alias(expr, name) + ) + df.sort(column("a")).show() table = pa.Table.from_batches(df.collect()) - expected = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [8, 5, 8], name: result} + expected = { + "a": [0, 1, 2, 3, 4, 5, 6], + "b": [7, None, 7, 8, 9, None, 9], + "c": ["A", "A", "A", "A", "B", "B", "B"], + name: result, + } assert table.sort_by("a").to_pydict() == expected @@ -512,9 +608,9 @@ def test_intersect(): [pa.array([3]), pa.array([6])], names=["a", "b"], ) - df_c = ctx.create_dataframe([[batch]]).sort(column("a").sort(ascending=True)) + df_c = ctx.create_dataframe([[batch]]).sort(column("a")) - df_a_i_b = df_a.intersect(df_b).sort(column("a").sort(ascending=True)) + df_a_i_b = df_a.intersect(df_b).sort(column("a")) assert df_c.collect() == df_a_i_b.collect() @@ -538,9 +634,9 @@ def test_except_all(): [pa.array([1, 2]), pa.array([4, 5])], names=["a", "b"], ) - df_c = ctx.create_dataframe([[batch]]).sort(column("a").sort(ascending=True)) + df_c = ctx.create_dataframe([[batch]]).sort(column("a")) - df_a_e_b = df_a.except_all(df_b).sort(column("a").sort(ascending=True)) + df_a_e_b = 
df_a.except_all(df_b).sort(column("a")) assert df_c.collect() == df_a_e_b.collect() @@ -573,9 +669,9 @@ def test_union(ctx): [pa.array([1, 2, 3, 3, 4, 5]), pa.array([4, 5, 6, 6, 7, 8])], names=["a", "b"], ) - df_c = ctx.create_dataframe([[batch]]).sort(column("a").sort(ascending=True)) + df_c = ctx.create_dataframe([[batch]]).sort(column("a")) - df_a_u_b = df_a.union(df_b).sort(column("a").sort(ascending=True)) + df_a_u_b = df_a.union(df_b).sort(column("a")) assert df_c.collect() == df_a_u_b.collect() @@ -597,9 +693,9 @@ def test_union_distinct(ctx): [pa.array([1, 2, 3, 4, 5]), pa.array([4, 5, 6, 7, 8])], names=["a", "b"], ) - df_c = ctx.create_dataframe([[batch]]).sort(column("a").sort(ascending=True)) + df_c = ctx.create_dataframe([[batch]]).sort(column("a")) - df_a_u_b = df_a.union(df_b, True).sort(column("a").sort(ascending=True)) + df_a_u_b = df_a.union(df_b, True).sort(column("a")) assert df_c.collect() == df_a_u_b.collect() assert df_c.collect() == df_a_u_b.collect() diff --git a/python/datafusion/tests/test_functions.py b/python/datafusion/tests/test_functions.py index e5429bd60..fe092c456 100644 --- a/python/datafusion/tests/test_functions.py +++ b/python/datafusion/tests/test_functions.py @@ -963,6 +963,7 @@ def test_first_last_value(df): assert result.column(3) == pa.array(["!"]) assert result.column(4) == pa.array([6]) assert result.column(5) == pa.array([datetime(2020, 7, 2)]) + df.show() def test_binary_string_functions(df): diff --git a/src/dataframe.rs b/src/dataframe.rs index 22b05226c..d7abab400 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -39,6 +39,7 @@ use pyo3::types::{PyCapsule, PyTuple}; use tokio::task::JoinHandle; use crate::errors::py_datafusion_err; +use crate::expr::to_sort_expressions; use crate::physical_plan::PyExecutionPlan; use crate::record_batch::PyRecordBatchStream; use crate::sql::logical::PyLogicalPlan; @@ -150,7 +151,7 @@ impl PyDataFrame { #[pyo3(signature = (*exprs))] fn sort(&self, exprs: Vec) -> PyResult { - let 
exprs = exprs.into_iter().map(|e| e.into()).collect(); + let exprs = to_sort_expressions(exprs); let df = self.df.as_ref().clone().sort(exprs)?; Ok(Self::new(df)) } diff --git a/src/expr.rs b/src/expr.rs index 04bfc85c2..697682d4c 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -16,10 +16,11 @@ // under the License. use datafusion_expr::utils::exprlist_to_fields; -use datafusion_expr::LogicalPlan; +use datafusion_expr::{ExprFuncBuilder, ExprFunctionExt, LogicalPlan}; use pyo3::{basic::CompareOp, prelude::*}; use std::convert::{From, Into}; use std::sync::Arc; +use window::PyWindowFrame; use arrow::pyarrow::ToPyArrow; use datafusion::arrow::datatypes::{DataType, Field}; @@ -32,7 +33,7 @@ use datafusion_expr::{ lit, Between, BinaryExpr, Case, Cast, Expr, Like, Operator, TryCast, }; -use crate::common::data_type::{DataTypeMap, RexType}; +use crate::common::data_type::{DataTypeMap, NullTreatment, RexType}; use crate::errors::{py_runtime_err, py_type_err, py_unsupported_variant_err, DataFusionError}; use crate::expr::aggregate_expr::PyAggregateFunction; use crate::expr::binary_expr::PyBinaryExpr; @@ -281,6 +282,10 @@ impl PyExpr { self.expr.clone().is_null().into() } + pub fn is_not_null(&self) -> PyExpr { + self.expr.clone().is_not_null().into() + } + pub fn cast(&self, to: PyArrowType) -> PyExpr { // self.expr.cast_to() requires DFSchema to validate that the cast // is supported, omit that for now @@ -510,6 +515,107 @@ impl PyExpr { pub fn column_name(&self, plan: PyLogicalPlan) -> PyResult { self._column_name(&plan.plan()).map_err(py_runtime_err) } + + // Expression Function Builder functions + + pub fn order_by(&self, order_by: Vec) -> PyExprFuncBuilder { + self.expr + .clone() + .order_by(to_sort_expressions(order_by)) + .into() + } + + pub fn filter(&self, filter: PyExpr) -> PyExprFuncBuilder { + self.expr.clone().filter(filter.expr.clone()).into() + } + + pub fn distinct(&self) -> PyExprFuncBuilder { + self.expr.clone().distinct().into() + } + + pub fn 
null_treatment(&self, null_treatment: NullTreatment) -> PyExprFuncBuilder { + self.expr + .clone() + .null_treatment(Some(null_treatment.into())) + .into() + } + + pub fn partition_by(&self, partition_by: Vec) -> PyExprFuncBuilder { + let partition_by = partition_by.iter().map(|e| e.expr.clone()).collect(); + self.expr.clone().partition_by(partition_by).into() + } + + pub fn window_frame(&self, window_frame: PyWindowFrame) -> PyExprFuncBuilder { + self.expr.clone().window_frame(window_frame.into()).into() + } +} + +#[pyclass(name = "ExprFuncBuilder", module = "datafusion.expr", subclass)] +#[derive(Debug, Clone)] +pub struct PyExprFuncBuilder { + pub builder: ExprFuncBuilder, +} + +impl From for PyExprFuncBuilder { + fn from(builder: ExprFuncBuilder) -> Self { + Self { builder } + } +} + +pub fn to_sort_expressions(order_by: Vec) -> Vec { + order_by + .iter() + .map(|e| e.expr.clone()) + .map(|e| match e { + Expr::Sort(_) => e, + _ => e.sort(true, true), + }) + .collect() +} + +#[pymethods] +impl PyExprFuncBuilder { + pub fn order_by(&self, order_by: Vec) -> PyExprFuncBuilder { + self.builder + .clone() + .order_by(to_sort_expressions(order_by)) + .into() + } + + pub fn filter(&self, filter: PyExpr) -> PyExprFuncBuilder { + self.builder.clone().filter(filter.expr.clone()).into() + } + + pub fn distinct(&self) -> PyExprFuncBuilder { + self.builder.clone().distinct().into() + } + + pub fn null_treatment(&self, null_treatment: NullTreatment) -> PyExprFuncBuilder { + self.builder + .clone() + .null_treatment(Some(null_treatment.into())) + .into() + } + + pub fn partition_by(&self, partition_by: Vec) -> PyExprFuncBuilder { + let partition_by = partition_by.iter().map(|e| e.expr.clone()).collect(); + self.builder.clone().partition_by(partition_by).into() + } + + pub fn window_frame(&self, window_frame: PyWindowFrame) -> PyExprFuncBuilder { + self.builder + .clone() + .window_frame(window_frame.into()) + .into() + } + + pub fn build(&self) -> PyResult { + self.builder + 
.clone() + .build() + .map(|expr| expr.into()) + .map_err(|err| err.into()) + } } impl PyExpr { diff --git a/src/expr/window.rs b/src/expr/window.rs index 786651194..7eb586082 100644 --- a/src/expr/window.rs +++ b/src/expr/window.rs @@ -168,7 +168,11 @@ fn not_window_function_err(expr: Expr) -> PyErr { impl PyWindowFrame { #[new] #[pyo3(signature=(unit, start_bound, end_bound))] - pub fn new(unit: &str, start_bound: Option, end_bound: Option) -> PyResult { + pub fn new( + unit: &str, + start_bound: Option, + end_bound: Option, + ) -> PyResult { let units = unit.to_ascii_lowercase(); let units = match units.as_str() { "rows" => WindowFrameUnits::Rows, @@ -182,9 +186,7 @@ impl PyWindowFrame { } }; let start_bound = match start_bound { - Some(start_bound) => { - WindowFrameBound::Preceding(ScalarValue::UInt64(Some(start_bound))) - } + Some(start_bound) => WindowFrameBound::Preceding(start_bound), None => match units { WindowFrameUnits::Range => WindowFrameBound::Preceding(ScalarValue::UInt64(None)), WindowFrameUnits::Rows => WindowFrameBound::Preceding(ScalarValue::UInt64(None)), @@ -197,7 +199,7 @@ impl PyWindowFrame { }, }; let end_bound = match end_bound { - Some(end_bound) => WindowFrameBound::Following(ScalarValue::UInt64(Some(end_bound))), + Some(end_bound) => WindowFrameBound::Following(end_bound), None => match units { WindowFrameUnits::Rows => WindowFrameBound::Following(ScalarValue::UInt64(None)), WindowFrameUnits::Range => WindowFrameBound::Following(ScalarValue::UInt64(None)), diff --git a/src/functions.rs b/src/functions.rs index 252563621..aed4de474 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -16,13 +16,16 @@ // under the License. 
use datafusion::functions_aggregate::all_default_aggregate_functions; +use datafusion_expr::window_function; use datafusion_expr::ExprFunctionExt; +use datafusion_expr::WindowFrame; use pyo3::{prelude::*, wrap_pyfunction}; use crate::common::data_type::NullTreatment; use crate::context::PySessionContext; use crate::errors::DataFusionError; use crate::expr::conditional_expr::PyCaseBuilder; +use crate::expr::to_sort_expressions; use crate::expr::window::PyWindowFrame; use crate::expr::PyExpr; use datafusion::execution::FunctionRegistry; @@ -316,18 +319,15 @@ pub fn regr_syy(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult, order_by: Option>, null_treatment: Option, ) -> PyResult { - // If we initialize the UDAF with order_by directly, then it gets over-written by the builder - let agg_fn = functions_aggregate::expr_fn::first_value(expr.expr, None); - - // luckily, I can guarantee initializing a builder with an `order_by` default of empty vec + // Since ExprFuncBuilder::new() is private, we can guarantee initializing + // a builder with an `order_by` default of empty vec let order_by = order_by .map(|x| x.into_iter().map(|x| x.expr).collect::>()) .unwrap_or_default(); @@ -348,32 +348,30 @@ pub fn first_value( } #[pyfunction] -pub fn last_value( +pub fn first_value( expr: PyExpr, distinct: bool, filter: Option, order_by: Option>, null_treatment: Option, ) -> PyResult { - let agg_fn = functions_aggregate::expr_fn::last_value(vec![expr.expr]); - - // luckily, I can guarantee initializing a builder with an `order_by` default of empty vec - let order_by = order_by - .map(|x| x.into_iter().map(|x| x.expr).collect::>()) - .unwrap_or_default(); - let mut builder = agg_fn.order_by(order_by); - - if distinct { - builder = builder.distinct(); - } + // If we initialize the UDAF with order_by directly, then it gets over-written by the builder + let agg_fn = functions_aggregate::expr_fn::first_value(expr.expr, None); - if let Some(filter) = filter { - builder = 
builder.filter(filter.expr); - } + add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) +} - builder = builder.null_treatment(null_treatment.map(DFNullTreatment::from)); +#[pyfunction] +pub fn last_value( + expr: PyExpr, + distinct: bool, + filter: Option, + order_by: Option>, + null_treatment: Option, +) -> PyResult { + let agg_fn = functions_aggregate::expr_fn::last_value(vec![expr.expr]); - Ok(builder.build()?.into()) + add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) } #[pyfunction] @@ -618,9 +616,11 @@ fn window( ctx: Option, ) -> PyResult { let fun = find_window_fn(name, ctx)?; + let window_frame = window_frame - .unwrap_or_else(|| PyWindowFrame::new("rows", None, Some(0)).unwrap()) - .into(); + .map(|w| w.into()) + .unwrap_or(WindowFrame::new(order_by.as_ref().map(|v| !v.is_empty()))); + Ok(PyExpr { expr: datafusion_expr::Expr::WindowFunction(WindowFunction { fun, @@ -634,6 +634,10 @@ fn window( .unwrap_or_default() .into_iter() .map(|x| x.expr) + .map(|e| match e { + Expr::Sort(_) => e, + _ => e.sort(true, true), + }) .collect::>(), window_frame, null_treatment: None, @@ -890,6 +894,116 @@ aggregate_function!(array_agg, functions_aggregate::array_agg::array_agg_udaf); aggregate_function!(max, functions_aggregate::min_max::max_udaf); aggregate_function!(min, functions_aggregate::min_max::min_udaf); +fn add_builder_fns_to_window( + window_fn: Expr, + partition_by: Option>, + order_by: Option>, +) -> PyResult { + // Since ExprFuncBuilder::new() is private, set an empty partition and then + // override later if appropriate. 
+ let mut builder = window_fn.partition_by(vec![]); + + if let Some(partition_cols) = partition_by { + builder = builder.partition_by( + partition_cols + .into_iter() + .map(|col| col.clone().into()) + .collect(), + ); + } + + if let Some(order_by_cols) = order_by { + let order_by_cols = to_sort_expressions(order_by_cols); + builder = builder.order_by(order_by_cols); + } + + builder.build().map(|e| e.into()).map_err(|err| err.into()) +} + +#[pyfunction] +pub fn lead( + arg: PyExpr, + shift_offset: i64, + default_value: Option, + partition_by: Option>, + order_by: Option>, +) -> PyResult { + let window_fn = window_function::lead(arg.expr, Some(shift_offset), default_value); + + add_builder_fns_to_window(window_fn, partition_by, order_by) +} + +#[pyfunction] +pub fn lag( + arg: PyExpr, + shift_offset: i64, + default_value: Option, + partition_by: Option>, + order_by: Option>, +) -> PyResult { + let window_fn = window_function::lag(arg.expr, Some(shift_offset), default_value); + + add_builder_fns_to_window(window_fn, partition_by, order_by) +} + +#[pyfunction] +pub fn row_number( + partition_by: Option>, + order_by: Option>, +) -> PyResult { + let window_fn = window_function::row_number(); + + add_builder_fns_to_window(window_fn, partition_by, order_by) +} + +#[pyfunction] +pub fn rank(partition_by: Option>, order_by: Option>) -> PyResult { + let window_fn = window_function::rank(); + + add_builder_fns_to_window(window_fn, partition_by, order_by) +} + +#[pyfunction] +pub fn dense_rank( + partition_by: Option>, + order_by: Option>, +) -> PyResult { + let window_fn = window_function::dense_rank(); + + add_builder_fns_to_window(window_fn, partition_by, order_by) +} + +#[pyfunction] +pub fn percent_rank( + partition_by: Option>, + order_by: Option>, +) -> PyResult { + let window_fn = window_function::percent_rank(); + + add_builder_fns_to_window(window_fn, partition_by, order_by) +} + +#[pyfunction] +pub fn cume_dist( + partition_by: Option>, + order_by: Option>, +) -> 
PyResult { + let window_fn = window_function::cume_dist(); + + add_builder_fns_to_window(window_fn, partition_by, order_by) +} + +#[pyfunction] +pub fn ntile( + arg: PyExpr, + partition_by: Option>, + order_by: Option>, +) -> PyResult { + let window_fn = window_function::ntile(arg.into()); + + add_builder_fns_to_window(window_fn, partition_by, order_by) +} + pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(abs))?; m.add_wrapped(wrap_pyfunction!(acos))?; @@ -1075,5 +1189,15 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(array_slice))?; m.add_wrapped(wrap_pyfunction!(flatten))?; + // Window Functions + m.add_wrapped(wrap_pyfunction!(lead))?; + m.add_wrapped(wrap_pyfunction!(lag))?; + m.add_wrapped(wrap_pyfunction!(row_number))?; + m.add_wrapped(wrap_pyfunction!(rank))?; + m.add_wrapped(wrap_pyfunction!(dense_rank))?; + m.add_wrapped(wrap_pyfunction!(percent_rank))?; + m.add_wrapped(wrap_pyfunction!(cume_dist))?; + m.add_wrapped(wrap_pyfunction!(ntile))?; + Ok(()) } From 57eb9596e0a3d6205bd3e94e83854111145572e6 Mon Sep 17 00:00:00 2001 From: Daniel Mesejo Date: Mon, 2 Sep 2024 16:51:52 +0200 Subject: [PATCH 023/248] chore: fix typos (#844) - run [codespell](https://github.com/codespell-project/codespell) on the source code - change name of parameter in db-benchmark.dockerfile based on spelling suggestion and the documentation: https://www.rdocumentation.org/packages/utils/versions/3.6.2/topics/install.packages --- benchmarks/db-benchmark/db-benchmark.dockerfile | 2 +- docs/mdbook/src/index.md | 2 +- docs/source/_static/theme_overrides.css | 2 +- docs/source/conf.py | 2 +- .../user-guide/common-operations/expressions.rst | 2 +- examples/export.py | 2 +- examples/python-udf-comparisons.py | 2 +- examples/tpch/q02_minimum_cost_supplier.py | 2 +- examples/tpch/q04_order_priority_checking.py | 4 ++-- examples/tpch/q06_forecasting_revenue_change.py | 2 +- 
examples/tpch/q07_volume_shipping.py | 2 +- examples/tpch/q11_important_stock_identification.py | 2 +- examples/tpch/q15_top_supplier.py | 2 +- examples/tpch/q20_potential_part_promotion.py | 2 +- examples/tpch/q21_suppliers_kept_orders_waiting.py | 2 +- examples/tpch/q22_global_sales_opportunity.py | 4 ++-- python/datafusion/context.py | 4 ++-- python/datafusion/expr.py | 2 +- python/datafusion/functions.py | 10 +++++----- python/datafusion/input/location.py | 4 ++-- python/datafusion/udf.py | 4 ++-- src/common/data_type.rs | 2 +- src/expr/table_scan.rs | 2 +- 23 files changed, 32 insertions(+), 32 deletions(-) diff --git a/benchmarks/db-benchmark/db-benchmark.dockerfile b/benchmarks/db-benchmark/db-benchmark.dockerfile index d8842b250..af2edd0f4 100644 --- a/benchmarks/db-benchmark/db-benchmark.dockerfile +++ b/benchmarks/db-benchmark/db-benchmark.dockerfile @@ -58,7 +58,7 @@ RUN cd pandas && \ RUN cd modin && \ virtualenv py-modin --python=/usr/bin/python3.10 -RUN Rscript -e 'install.packages(c("jsonlite","bit64","devtools","rmarkdown"), dependecies=TRUE, repos="https://cloud.r-project.org")' +RUN Rscript -e 'install.packages(c("jsonlite","bit64","devtools","rmarkdown"), dependencies=TRUE, repos="https://cloud.r-project.org")' SHELL ["/bin/bash", "-c"] diff --git a/docs/mdbook/src/index.md b/docs/mdbook/src/index.md index 3cd0fec1d..2c1d217f8 100644 --- a/docs/mdbook/src/index.md +++ b/docs/mdbook/src/index.md @@ -18,7 +18,7 @@ DataFusion is a blazing fast query engine that lets you run data analyses quickly and reliably. -DataFusion is written in Rust, but also exposes Python and SQL bindings, so you can easily query data in your langauge of choice. You don't need to know any Rust to be a happy and productive user of DataFusion. +DataFusion is written in Rust, but also exposes Python and SQL bindings, so you can easily query data in your language of choice. You don't need to know any Rust to be a happy and productive user of DataFusion. 
DataFusion lets you run queries faster than pandas. Let's compare query runtimes for a 5GB CSV file with 100 million rows of data. diff --git a/docs/source/_static/theme_overrides.css b/docs/source/_static/theme_overrides.css index 1e972cc6f..aaa40fba2 100644 --- a/docs/source/_static/theme_overrides.css +++ b/docs/source/_static/theme_overrides.css @@ -56,7 +56,7 @@ a.navbar-brand img { /* This is the bootstrap CSS style for "table-striped". Since the theme does -not yet provide an easy way to configure this globaly, it easier to simply +not yet provide an easy way to configure this globally, it easier to simply include this snippet here than updating each table in all rst files to add ":class: table-striped" */ diff --git a/docs/source/conf.py b/docs/source/conf.py index d5084551e..2e5a41339 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -"""Documenation generation.""" +"""Documentation generation.""" # Configuration file for the Sphinx documentation builder. # diff --git a/docs/source/user-guide/common-operations/expressions.rst b/docs/source/user-guide/common-operations/expressions.rst index c8f8b8f29..e35234c32 100644 --- a/docs/source/user-guide/common-operations/expressions.rst +++ b/docs/source/user-guide/common-operations/expressions.rst @@ -21,7 +21,7 @@ Expressions =========== In DataFusion an expression is an abstraction that represents a computation. -Expressions are used as the primary inputs and ouputs for most functions within +Expressions are used as the primary inputs and outputs for most functions within DataFusion. As such, expressions can be combined to create expression trees, a concept shared across most compilers and databases. 
diff --git a/examples/export.py b/examples/export.py index d179bf39d..cc02de52b 100644 --- a/examples/export.py +++ b/examples/export.py @@ -48,6 +48,6 @@ pylist = df.to_pylist() assert pylist == [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}] -# export to Pyton dictionary of columns +# export to Python dictionary of columns pydict = df.to_pydict() assert pydict == {"a": [1, 2, 3], "b": [4, 5, 6]} diff --git a/examples/python-udf-comparisons.py b/examples/python-udf-comparisons.py index 5a6f548fb..9a84dd730 100644 --- a/examples/python-udf-comparisons.py +++ b/examples/python-udf-comparisons.py @@ -28,7 +28,7 @@ # question "return all of the rows that have a specific combination of these # values". We have the combinations we care about provided as a python # list of tuples. There is no built in function that supports this operation, -# but it can be explicilty specified via a single expression or we can +# but it can be explicitly specified via a single expression or we can # use a user defined function. ctx = SessionContext() diff --git a/examples/tpch/q02_minimum_cost_supplier.py b/examples/tpch/q02_minimum_cost_supplier.py index f4020d7bb..2171a2083 100644 --- a/examples/tpch/q02_minimum_cost_supplier.py +++ b/examples/tpch/q02_minimum_cost_supplier.py @@ -96,7 +96,7 @@ # create a column of that value. We can then filter down any rows for which the cost and # minimum do not match. -# The default window frame as of 5/6/2024 is from unbounded preceeding to the current row. +# The default window frame as of 5/6/2024 is from unbounded preceding to the current row. # We want to evaluate the entire data frame, so we specify this. 
window_frame = datafusion.WindowFrame("rows", None, None) df = df.with_column( diff --git a/examples/tpch/q04_order_priority_checking.py b/examples/tpch/q04_order_priority_checking.py index 9dbd81674..77c3bd43e 100644 --- a/examples/tpch/q04_order_priority_checking.py +++ b/examples/tpch/q04_order_priority_checking.py @@ -53,9 +53,9 @@ # Limit results to cases where commitment date before receipt date # Aggregate the results so we only get one row to join with the order table. -# Alterately, and likely more idomatic is instead of `.aggregate` you could +# Alternately, and likely more idiomatic is instead of `.aggregate` you could # do `.select_columns("l_orderkey").distinct()`. The goal here is to show -# mulitple examples of how to use Data Fusion. +# multiple examples of how to use Data Fusion. df_lineitem = df_lineitem.filter(col("l_commitdate") < col("l_receiptdate")).aggregate( [col("l_orderkey")], [] ) diff --git a/examples/tpch/q06_forecasting_revenue_change.py b/examples/tpch/q06_forecasting_revenue_change.py index ec98aaf5e..3beb9eb1f 100644 --- a/examples/tpch/q06_forecasting_revenue_change.py +++ b/examples/tpch/q06_forecasting_revenue_change.py @@ -82,5 +82,5 @@ revenue = df.collect()[0]["revenue"][0].as_py() -# Note: the output value from this query may be dependant on the size of the database generated +# Note: the output value from this query may be dependent on the size of the database generated print(f"Potential lost revenue: {revenue:.2f}") diff --git a/examples/tpch/q07_volume_shipping.py b/examples/tpch/q07_volume_shipping.py index fd7323b79..44c605a9b 100644 --- a/examples/tpch/q07_volume_shipping.py +++ b/examples/tpch/q07_volume_shipping.py @@ -77,7 +77,7 @@ # the two nations of interest. Since there is no `otherwise()` statement, any values that do # not match these will result in a null value and then get filtered out. 
# -# To do the same using a simle filter would be: +# To do the same using a simple filter would be: # df_nation = df_nation.filter((F.col("n_name") == nation_1) | (F.col("n_name") == nation_2)) df_nation = df_nation.with_column( "n_name", diff --git a/examples/tpch/q11_important_stock_identification.py b/examples/tpch/q11_important_stock_identification.py index 267248707..391eb45b1 100644 --- a/examples/tpch/q11_important_stock_identification.py +++ b/examples/tpch/q11_important_stock_identification.py @@ -63,7 +63,7 @@ # Compute total value of specific parts df = df.aggregate([col("ps_partkey")], [F.sum(col("value")).alias("value")]) -# By default window functions go from unbounded preceeding to current row, but we want +# By default window functions go from unbounded preceding to current row, but we want # to compute this sum across all rows window_frame = WindowFrame("rows", None, None) diff --git a/examples/tpch/q15_top_supplier.py b/examples/tpch/q15_top_supplier.py index 4b9e4c1dd..aa76093ec 100644 --- a/examples/tpch/q15_top_supplier.py +++ b/examples/tpch/q15_top_supplier.py @@ -78,7 +78,7 @@ # from the supplier table df = df.join(df_supplier, (["l_suppkey"], ["s_suppkey"]), "inner") -# Return only the colums requested +# Return only the columns requested df = df.select_columns("s_suppkey", "s_name", "s_address", "s_phone", "total_revenue") # If we have more than one, sort by supplier number (suppkey) diff --git a/examples/tpch/q20_potential_part_promotion.py b/examples/tpch/q20_potential_part_promotion.py index 05a267450..4ced7aaa1 100644 --- a/examples/tpch/q20_potential_part_promotion.py +++ b/examples/tpch/q20_potential_part_promotion.py @@ -74,7 +74,7 @@ # This will filter down the line items to the parts of interest df = df.join(df_part, (["l_partkey"], ["p_partkey"]), "inner") -# Compute the total sold and limit ourselves to indivdual supplier/part combinations +# Compute the total sold and limit ourselves to individual supplier/part combinations df 
= df.aggregate( [col("l_partkey"), col("l_suppkey")], [F.sum(col("l_quantity")).alias("total_sold")] ) diff --git a/examples/tpch/q21_suppliers_kept_orders_waiting.py b/examples/tpch/q21_suppliers_kept_orders_waiting.py index 9f59804e5..6b1679e7d 100644 --- a/examples/tpch/q21_suppliers_kept_orders_waiting.py +++ b/examples/tpch/q21_suppliers_kept_orders_waiting.py @@ -74,7 +74,7 @@ # only orders where this array is larger than one for multiple supplier orders. The second column # is all of the suppliers who failed to make their commitment. We can filter the second column for # arrays with size one. That combination will give us orders that had multiple suppliers where only -# one failed. Use distinct=True in the blow aggregation so we don't get multipe line items from the +# one failed. Use distinct=True in the blow aggregation so we don't get multiple line items from the # same supplier reported in either array. df = df.aggregate( [col("o_orderkey")], diff --git a/examples/tpch/q22_global_sales_opportunity.py b/examples/tpch/q22_global_sales_opportunity.py index 622c1429f..41fd5de9e 100644 --- a/examples/tpch/q22_global_sales_opportunity.py +++ b/examples/tpch/q22_global_sales_opportunity.py @@ -45,14 +45,14 @@ # The nation code is a two digit number, but we need to convert it to a string literal nation_codes = F.make_array(*[lit(str(n)) for n in NATION_CODES]) -# Use the substring operation to extract the first two charaters of the phone number +# Use the substring operation to extract the first two characters of the phone number df = df_customer.with_column("cntrycode", F.substring(col("c_phone"), lit(0), lit(3))) # Limit our search to customers with some balance and in the country code above df = df.filter(col("c_acctbal") > lit(0.0)) df = df.filter(~F.array_position(nation_codes, col("cntrycode")).is_null()) -# Compute the average balance. By default, the window frame is from unbounded preceeding to the +# Compute the average balance. 
By default, the window frame is from unbounded preceding to the # current row. We want our frame to cover the entire data frame. window_frame = WindowFrame("rows", None, None) df = df.with_column( diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 283f71e1e..f6bf1c6ce 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -436,7 +436,7 @@ def __init__( Example usage: - The following example demostrates how to use the context to execute + The following example demonstrates how to use the context to execute a query against a CSV data source using the :py:class:`DataFrame` API:: from datafusion import SessionContext @@ -853,7 +853,7 @@ def empty_table(self) -> DataFrame: return DataFrame(self.ctx.empty_table()) def session_id(self) -> str: - """Retrun an id that uniquely identifies this :py:class:`SessionContext`.""" + """Return an id that uniquely identifies this :py:class:`SessionContext`.""" return self.ctx.session_id() def read_json( diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index c7272bb3b..742f8e43d 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -515,7 +515,7 @@ def __init__( Args: units: Should be one of ``rows``, ``range``, or ``groups``. - start_bound: Sets the preceeding bound. Must be >= 0. If none, this + start_bound: Sets the preceding bound. Must be >= 0. If none, this will be set to unbounded. If unit type is ``groups``, this parameter must be set. end_bound: Sets the following bound. Must be >= 0. If none, this diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 28201c1d1..120fed819 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -342,7 +342,7 @@ def concat(*args: Expr) -> Expr: def concat_ws(separator: str, *args: Expr) -> Expr: """Concatenates the list ``args`` with the separator. - ``NULL`` arugments are ignored. ``separator`` should not be ``NULL``. 
+ ``NULL`` arguments are ignored. ``separator`` should not be ``NULL``. """ args = [arg.expr for arg in args] return Expr(f.concat_ws(separator, args)) @@ -541,7 +541,7 @@ def ends_with(arg: Expr, suffix: Expr) -> Expr: def exp(arg: Expr) -> Expr: - """Returns the exponential of the arugment.""" + """Returns the exponential of the argument.""" return Expr(f.exp(arg.expr)) @@ -1593,7 +1593,7 @@ def grouping(arg: Expr, distinct: bool = False) -> Expr: def max(arg: Expr, distinct: bool = False) -> Expr: - """Returns the maximum value of the arugment.""" + """Returns the maximum value of the argument.""" return Expr(f.max(arg.expr, distinct=distinct)) @@ -1769,12 +1769,12 @@ def bit_xor(arg: Expr, distinct: bool = False) -> Expr: def bool_and(arg: Expr, distinct: bool = False) -> Expr: - """Computes the boolean AND of the arugment.""" + """Computes the boolean AND of the argument.""" return Expr(f.bool_and(arg.expr, distinct=distinct)) def bool_or(arg: Expr, distinct: bool = False) -> Expr: - """Computes the boolean OR of the arguement.""" + """Computes the boolean OR of the argument.""" return Expr(f.bool_or(arg.expr, distinct=distinct)) diff --git a/python/datafusion/input/location.py b/python/datafusion/input/location.py index 566a63da9..b274539fc 100644 --- a/python/datafusion/input/location.py +++ b/python/datafusion/input/location.py @@ -66,7 +66,7 @@ def build_table( # Consume header row and count number of rows for statistics. # TODO: Possibly makes sense to have the eager number of rows # calculated as a configuration since you must read the entire file - # to get that information. However, this should only be occuring + # to get that information. However, this should only be occurring # at table creation time and therefore shouldn't # slow down query performance. 
with open(input_file, "r") as file: @@ -75,7 +75,7 @@ def build_table( print(header_row) for _ in reader: num_rows += 1 - # TODO: Need to actually consume this row into resonable columns + # TODO: Need to actually consume this row into reasonable columns raise RuntimeError("TODO: Currently unable to support CSV input files.") else: raise RuntimeError( diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index bdbad661a..a3b74bb11 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -153,7 +153,7 @@ def state(self) -> List[pyarrow.Scalar]: @abstractmethod def update(self, values: pyarrow.Array) -> None: - """Evalute an array of values and update state.""" + """Evaluate an array of values and update state.""" pass @abstractmethod @@ -189,7 +189,7 @@ def __init__( ) -> None: """Instantiate a user defined aggregate function (UDAF). - See :py:func:`udaf` for a convenience function and arugment + See :py:func:`udaf` for a convenience function and argument descriptions. """ self._udf = df_internal.AggregateUDF( diff --git a/src/common/data_type.rs b/src/common/data_type.rs index 21b085c0e..bdfc5a6b1 100644 --- a/src/common/data_type.rs +++ b/src/common/data_type.rs @@ -40,7 +40,7 @@ pub enum RexType { /// Arrow types which represents the underlying arrow format /// Python types which represent the type in Python /// It is important to keep all of those types in a single -/// and managable location. Therefore this structure exists +/// and manageable location. Therefore this structure exists /// to map those types and provide a simple place for developers /// to map types from one system to another. #[derive(Debug, Clone)] diff --git a/src/expr/table_scan.rs b/src/expr/table_scan.rs index 8fafd12ee..386c43b6a 100644 --- a/src/expr/table_scan.rs +++ b/src/expr/table_scan.rs @@ -94,7 +94,7 @@ impl PyTableScan { /// The column indexes that should be. Note if this is empty then /// all columns should be read by the `TableProvider`. 
This function - /// provides a Tuple of the (index, column_name) to make things simplier + /// provides a Tuple of the (index, column_name) to make things simpler /// for the calling code since often times the name is preferred to /// the index which is a lower level abstraction. #[pyo3(name = "projection")] From e8ebc4fab68941b5fb9adadd3a39ec2c8ac120ce Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Mon, 2 Sep 2024 09:52:21 -0500 Subject: [PATCH 024/248] build(ci): use proper mac runners (#841) * build(ci): use proper mac runners GH updated the runner images. macos-14 is now ARM64. macos-13 is last x86_64. Closes https://github.com/apache/datafusion-python/issues/831 * update name for macos x86_64 build job * update manylinux x86_64 job name for consistency --- .github/workflows/build.yml | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a4f8b2da5..5fae13f2e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -117,10 +117,10 @@ jobs: name: dist-${{ matrix.os }} path: target/wheels/* - build-macos-aarch64: + build-macos-x86_64: needs: [generate-license] - name: Mac arm64 - runs-on: macos-latest + name: Mac x86_64 + runs-on: macos-13 strategy: fail-fast: false matrix: @@ -136,9 +136,6 @@ jobs: with: toolchain: stable - - name: Set up Rust targets - run: rustup target add aarch64-apple-darwin - - name: Upgrade pip run: python -m pip install --upgrade pip @@ -159,7 +156,7 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Build Python package - run: maturin build --release --strip --target aarch64-apple-darwin --features substrait + run: maturin build --release --strip --features substrait - name: List Mac wheels run: find target/wheels/ @@ -171,7 +168,7 @@ jobs: build-manylinux-x86_64: needs: [generate-license] - name: Manylinux + name: Manylinux x86_64 runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -262,7 +259,7 @@ jobs: 
runs-on: ubuntu-latest needs: - build-python-mac-win - - build-macos-aarch64 + - build-macos-x86_64 - build-manylinux-x86_64 - build-manylinux-aarch64 - build-sdist From 909b809b7a047d470ce5d187c5e9dd734c9d3163 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 2 Sep 2024 11:27:44 -0400 Subject: [PATCH 025/248] Set of small features (#839) * Add repr_html to give nice displays in notebooks when using display(df) * Allow get_item to get index of an array or a field in a struct * add test for getting array elements * Small typo in array * Add DataFrame transform * Update index in unit test * Add dataframe transform unit test * Add unit test for repr_html * Updating documentation * fix typo --------- Co-authored-by: Andy Grove --- .../common-operations/expressions.rst | 37 +++++++++++++++ python/datafusion/dataframe.py | 26 +++++++++++ python/datafusion/expr.py | 19 ++++++-- python/datafusion/functions.py | 2 +- python/datafusion/tests/test_dataframe.py | 32 +++++++++++++ python/datafusion/tests/test_expr.py | 23 ++++++++++ src/dataframe.rs | 46 +++++++++++++++++++ 7 files changed, 181 insertions(+), 4 deletions(-) diff --git a/docs/source/user-guide/common-operations/expressions.rst b/docs/source/user-guide/common-operations/expressions.rst index e35234c32..6014c9d2e 100644 --- a/docs/source/user-guide/common-operations/expressions.rst +++ b/docs/source/user-guide/common-operations/expressions.rst @@ -60,6 +60,43 @@ examples for the and, or, and not operations. heavy_red_units = (col("color") == lit("red")) & (col("weight") > lit(42)) not_red_units = ~(col("color") == lit("red")) +Arrays +------ + +For columns that contain arrays of values, you can access individual elements of the array by index +using bracket indexing. 
This is similar to calling the function +:py:func:`datafusion.functions.array_element`, except that array indexing using brackets is 0 based, +similar to Python arrays and ``array_element`` is 1 based indexing to be compatible with other SQL +approaches. + +.. ipython:: python + + from datafusion import SessionContext, col + + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 2, 3], [4, 5, 6]]}) + df.select(col("a")[0].alias("a0")) + + +.. warning:: + + Indexing an element of an array via ``[]`` starts at index 0 whereas + :py:func:`~datafusion.functions.array_element` starts at index 1. + +Structs +------- + +Columns that contain struct elements can be accessed using the bracket notation as if they were +Python dictionary style objects. This expects a string key as the parameter passed. + +.. ipython:: python + + ctx = SessionContext() + data = {"a": [{"size": 15, "color": "green"}, {"size": 10, "color": "blue"}]} + df = ctx.from_pydict(data) + df.select(col("a")["size"].alias("a_size")) + + Functions --------- diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 0e7d82e29..46b8fa1bd 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -30,6 +30,7 @@ import pandas as pd import polars as pl import pathlib + from typing import Callable from datafusion._internal import DataFrame as DataFrameInternal from datafusion.expr import Expr @@ -72,6 +73,9 @@ def __repr__(self) -> str: """ return self.df.__repr__() + def _repr_html_(self) -> str: + return self.df._repr_html_() + def describe(self) -> DataFrame: """Return the statistics for this DataFrame. @@ -539,3 +543,25 @@ def __arrow_c_stream__(self, requested_schema: pa.Schema) -> Any: Arrow PyCapsule object. """ return self.df.__arrow_c_stream__(requested_schema) + + def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame: + """Apply a function to the current DataFrame which returns another DataFrame.
+ + This is useful for chaining together multiple functions. For example:: + + def add_3(df: DataFrame) -> DataFrame: + return df.with_column("modified", lit(3)) + + def within_limit(df: DataFrame, limit: int) -> DataFrame: + return df.filter(col("a") < lit(limit)).distinct() + + df = df.transform(add_3).transform(within_limit, 4) + + Args: + func: A callable function that takes a DataFrame as its first argument + args: Zero or more arguments to pass to `func` + + Returns: + DataFrame: After applying func to the original dataframe. + """ + return func(self, *args) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 742f8e43d..7bea0289b 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -22,7 +22,11 @@ from __future__ import annotations -from ._internal import expr as expr_internal, LogicalPlan +from ._internal import ( + expr as expr_internal, + LogicalPlan, + functions as functions_internal, +) from datafusion.common import NullTreatment, RexType, DataTypeMap from typing import Any, Optional import pyarrow as pa @@ -257,8 +261,17 @@ def __invert__(self) -> Expr: """Binary not (~).""" return Expr(self.expr.__invert__()) - def __getitem__(self, key: str) -> Expr: - """For struct data types, return the field indicated by ``key``.""" + def __getitem__(self, key: str | int) -> Expr: + """Retrieve sub-object. + + If ``key`` is a string, returns the subfield of the struct. + If ``key`` is an integer, retrieves the element in the array. Note that the + element index begins at ``0``, unlike `array_element` which begins at ``1``.
+ """ + if isinstance(key, int): + return Expr( + functions_internal.array_element(self.expr, Expr.literal(key + 1).expr) + ) return Expr(self.expr.__getitem__(key)) def __eq__(self, rhs: Any) -> Expr: diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 120fed819..4c701b24d 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1023,7 +1023,7 @@ def array(*args: Expr) -> Expr: This is an alias for :py:func:`make_array`. """ - return make_array(args) + return make_array(*args) def range(start: Expr, stop: Expr, step: Expr) -> Expr: diff --git a/python/datafusion/tests/test_dataframe.py b/python/datafusion/tests/test_dataframe.py index c2a5f22ba..90954d09a 100644 --- a/python/datafusion/tests/test_dataframe.py +++ b/python/datafusion/tests/test_dataframe.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import os +from typing import Any import pyarrow as pa from pyarrow.csv import write_csv @@ -970,3 +971,34 @@ def test_dataframe_export(df) -> None: except Exception: failed_convert = True assert failed_convert + + +def test_dataframe_transform(df): + def add_string_col(df_internal) -> DataFrame: + return df_internal.with_column("string_col", literal("string data")) + + def add_with_parameter(df_internal, value: Any) -> DataFrame: + return df_internal.with_column("new_col", literal(value)) + + df = df.transform(add_string_col).transform(add_with_parameter, 3) + + result = df.to_pydict() + + assert result["a"] == [1, 2, 3] + assert result["string_col"] == ["string data" for _i in range(0, 3)] + assert result["new_col"] == [3 for _i in range(0, 3)] + + +def test_dataframe_repr_html(df) -> None: + output = df._repr_html_() + + ref_html = """ + + + + +
abc
148
255
368
+ """ + + # Ignore whitespace just to make this test look cleaner + assert output.replace(" ", "") == ref_html.replace(" ", "") diff --git a/python/datafusion/tests/test_expr.py b/python/datafusion/tests/test_expr.py index 9071108cb..056d2ea03 100644 --- a/python/datafusion/tests/test_expr.py +++ b/python/datafusion/tests/test_expr.py @@ -169,3 +169,26 @@ def traverse_logical_plan(plan): == '[Expr(Utf8("dfa")), Expr(Utf8("ad")), Expr(Utf8("dfre")), Expr(Utf8("vsa"))]' ) assert not variant.negated() + + +def test_expr_getitem() -> None: + ctx = SessionContext() + data = { + "array_values": [[1, 2, 3], [4, 5], [6], []], + "struct_values": [ + {"name": "Alice", "age": 15}, + {"name": "Bob", "age": 14}, + {"name": "Charlie", "age": 13}, + {"name": None, "age": 12}, + ], + } + df = ctx.from_pydict(data, name="table1") + + names = df.select(col("struct_values")["name"].alias("name")).collect() + names = [r.as_py() for rs in names for r in rs["name"]] + + array_values = df.select(col("array_values")[1].alias("value")).collect() + array_values = [r.as_py() for rs in array_values for r in rs["value"]] + + assert names == ["Alice", "Bob", "Charlie", None] + assert array_values == [2, 5, None, None] diff --git a/src/dataframe.rs b/src/dataframe.rs index d7abab400..3fb8b2292 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -23,6 +23,7 @@ use arrow::compute::can_cast_types; use arrow::error::ArrowError; use arrow::ffi::FFI_ArrowSchema; use arrow::ffi_stream::FFI_ArrowArrayStream; +use arrow::util::display::{ArrayFormatter, FormatOptions}; use datafusion::arrow::datatypes::Schema; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; use datafusion::arrow::util::pretty; @@ -95,6 +96,51 @@ impl PyDataFrame { } } + fn _repr_html_(&self, py: Python) -> PyResult { + let mut html_str = "\n".to_string(); + + let df = self.df.as_ref().clone().limit(0, Some(10))?; + let batches = wait_for_future(py, df.collect())?; + + if batches.is_empty() { + html_str.push_str("
\n"); + return Ok(html_str); + } + + let schema = batches[0].schema(); + + let mut header = Vec::new(); + for field in schema.fields() { + header.push(format!("{}", field.name())); + } + let header_str = header.join(""); + html_str.push_str(&format!("{}\n", header_str)); + + for batch in batches { + let formatters = batch + .columns() + .iter() + .map(|c| ArrayFormatter::try_new(c.as_ref(), &FormatOptions::default())) + .map(|c| { + c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string()))) + }) + .collect::, _>>()?; + + for row in 0..batch.num_rows() { + let mut cells = Vec::new(); + for formatter in &formatters { + cells.push(format!("{}", formatter.value(row))); + } + let row_str = cells.join(""); + html_str.push_str(&format!("{}\n", row_str)); + } + } + + html_str.push_str("\n"); + + Ok(html_str) + } + /// Calculate summary statistics for a DataFrame fn describe(&self, py: Python) -> PyResult { let df = self.df.as_ref().clone(); From fe0738a9c0b536cdf20b0dc0455d14a0d16d2835 Mon Sep 17 00:00:00 2001 From: Daniel Mesejo Date: Mon, 2 Sep 2024 17:28:59 +0200 Subject: [PATCH 026/248] feat: better exception and message for table not found (#851) closes #796 --- python/datafusion/tests/test_context.py | 7 +++++++ src/context.rs | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/python/datafusion/tests/test_context.py b/python/datafusion/tests/test_context.py index 0184280c2..4af00a3b4 100644 --- a/python/datafusion/tests/test_context.py +++ b/python/datafusion/tests/test_context.py @@ -466,6 +466,13 @@ def test_table_exist(ctx): assert ctx.table_exist("t") is True +def test_table_not_found(ctx): + from uuid import uuid4 + + with pytest.raises(KeyError): + ctx.table(f"not-found-{uuid4()}") + + def test_read_json(ctx): path = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/context.rs b/src/context.rs index 4433d94c2..3ab783495 100644 --- a/src/context.rs +++ b/src/context.rs @@ -765,7 +765,8 @@ impl PySessionContext { } pub 
fn table(&self, name: &str, py: Python) -> PyResult { - let x = wait_for_future(py, self.ctx.table(name)).map_err(DataFusionError::from)?; + let x = wait_for_future(py, self.ctx.table(name)) + .map_err(|e| PyKeyError::new_err(e.to_string()))?; Ok(PyDataFrame::new(x)) } From 859acb4c582eb6ab0147843ace748a93c359264f Mon Sep 17 00:00:00 2001 From: Daniel Mesejo Date: Fri, 6 Sep 2024 17:28:53 +0200 Subject: [PATCH 027/248] feat: make cast accept built-in Python types (#858) --- python/datafusion/expr.py | 21 +++++++++++-- python/datafusion/tests/test_functions.py | 37 +++++++++++++++++------ 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 7bea0289b..a58634b53 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -28,7 +28,7 @@ functions as functions_internal, ) from datafusion.common import NullTreatment, RexType, DataTypeMap -from typing import Any, Optional +from typing import Any, Optional, Type import pyarrow as pa # The following are imported from the internal representation. 
We may choose to @@ -372,8 +372,25 @@ def is_not_null(self) -> Expr: """Returns ``True`` if this expression is not null.""" return Expr(self.expr.is_not_null()) - def cast(self, to: pa.DataType[Any]) -> Expr: + _to_pyarrow_types = { + float: pa.float64(), + int: pa.int64(), + str: pa.string(), + bool: pa.bool_(), + } + + def cast( + self, to: pa.DataType[Any] | Type[float] | Type[int] | Type[str] | Type[bool] + ) -> Expr: """Cast to a new data type.""" + if not isinstance(to, pa.DataType): + try: + to = self._to_pyarrow_types[to] + except KeyError: + raise TypeError( + "Expected instance of pyarrow.DataType or builtins.type" + ) + return Expr(self.expr.cast(to)) def rex_type(self) -> RexType: diff --git a/python/datafusion/tests/test_functions.py b/python/datafusion/tests/test_functions.py index fe092c456..e7e6d79e1 100644 --- a/python/datafusion/tests/test_functions.py +++ b/python/datafusion/tests/test_functions.py @@ -44,8 +44,9 @@ def df(): datetime(2020, 7, 2), ] ), + pa.array([False, True, True]), ], - names=["a", "b", "c", "d"], + names=["a", "b", "c", "d", "e"], ) return ctx.create_dataframe([[batch]]) @@ -63,15 +64,14 @@ def test_named_struct(df): ) expected = """DataFrame() -+-------+---+---------+------------------------------+ -| a | b | c | d | -+-------+---+---------+------------------------------+ -| Hello | 4 | hello | {a: Hello, b: 4, c: hello } | -| World | 5 | world | {a: World, b: 5, c: world } | -| ! | 6 | ! | {a: !, b: 6, c: !} | -+-------+---+---------+------------------------------+ ++-------+---+---------+------------------------------+-------+ +| a | b | c | d | e | ++-------+---+---------+------------------------------+-------+ +| Hello | 4 | hello | {a: Hello, b: 4, c: hello } | false | +| World | 5 | world | {a: World, b: 5, c: world } | true | +| ! | 6 | ! 
| {a: !, b: 6, c: !} | true | ++-------+---+---------+------------------------------+-------+ """.strip() - assert str(df) == expected @@ -978,3 +978,22 @@ def test_binary_string_functions(df): assert pa.array(result.column(1)).cast(pa.string()) == pa.array( ["Hello", "World", "!"] ) + + +@pytest.mark.parametrize( + "python_datatype, name, expected", + [ + pytest.param(bool, "e", pa.bool_(), id="bool"), + pytest.param(int, "b", pa.int64(), id="int"), + pytest.param(float, "b", pa.float64(), id="float"), + pytest.param(str, "b", pa.string(), id="str"), + ], +) +def test_cast(df, python_datatype, name: str, expected): + df = df.select( + column(name).cast(python_datatype).alias("actual"), + column(name).cast(expected).alias("expected"), + ) + result = df.collect() + result = result[0] + assert result.column(0) == result.column(1) From a0913c728f5f323c1eb4913e614c9d996083e274 Mon Sep 17 00:00:00 2001 From: Daniel Mesejo Date: Fri, 6 Sep 2024 17:30:04 +0200 Subject: [PATCH 028/248] chore: fix docstrings, typos (#852) --- python/datafusion/context.py | 12 +++++++----- python/datafusion/expr.py | 2 +- python/datafusion/functions.py | 5 ----- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index f6bf1c6ce..903d4a107 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -73,7 +73,7 @@ def with_create_default_catalog_and_schema( def with_default_catalog_and_schema( self, catalog: str, schema: str ) -> SessionConfig: - """Select a name for the default catalog and shcema. + """Select a name for the default catalog and schema. Args: catalog: Catalog name. @@ -517,7 +517,7 @@ def sql(self, query: str, options: SQLOptions | None = None) -> DataFrame: def sql_with_options(self, query: str, options: SQLOptions) -> DataFrame: """Create a :py:class:`~datafusion.dataframe.DataFrame` from SQL query text. 
- This function will first validating that the query is allowed by the + This function will first validate that the query is allowed by the provided options. Args: @@ -636,12 +636,14 @@ def from_polars(self, data: polars.DataFrame, name: str | None = None) -> DataFr """ return DataFrame(self.ctx.from_polars(data, name)) - def register_table(self, name: str, table: pyarrow.Table) -> None: - """Register a table with the given name into the session. + def register_table(self, name: str, table: Table) -> None: + """Register a :py:class: `~datafusion.catalog.Table` as a table. + + The registered table can be referenced from SQL statement executed against. Args: name: Name of the resultant table. - table: PyArrow table to add to the session context. + table: DataFusion table to add to the session context. """ self.ctx.register_table(name, table) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index a58634b53..7fa608037 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -266,7 +266,7 @@ def __getitem__(self, key: str | int) -> Expr: If ``key`` is a string, returns the subfield of the struct. If ``key`` is an integer, retrieves the element in the array. Note that the - element index begins at ``0``, unlike `array_element` which begines at ``1``. + element index begins at ``0``, unlike `array_element` which begins at ``1``. 
""" if isinstance(key, int): return Expr( diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 4c701b24d..97b4fe1d5 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -18,11 +18,6 @@ from __future__ import annotations -# from datafusion._internal.context import SessionContext -# from datafusion._internal.expr import Expr -# from datafusion._internal.expr.conditional_expr import CaseBuilder -# from datafusion._internal.expr.window import WindowFrame - from datafusion._internal import functions as f, common from datafusion.expr import CaseBuilder, Expr, WindowFrame from datafusion.context import SessionContext From 6154ef983faeb01da98e9fcd6755b89fdef3d9ea Mon Sep 17 00:00:00 2001 From: Matt Green Date: Fri, 6 Sep 2024 18:10:10 -0700 Subject: [PATCH 029/248] chore: Use datafusion re-exported dependencies (#856) * Use datafusion re-exported crates * update dependency export syntax --- .gitignore | 1 + Cargo.lock | 5 ---- Cargo.toml | 7 +----- src/common/data_type.rs | 4 ++-- src/common/df_schema.rs | 2 +- src/common/schema.rs | 14 +++++------ src/config.rs | 2 +- src/context.rs | 6 ++--- src/dataframe.rs | 6 ++--- src/dataset.rs | 2 +- src/dataset_exec.rs | 4 ++-- src/expr.rs | 8 +++---- src/expr/aggregate.rs | 8 +++---- src/expr/aggregate_expr.rs | 2 +- src/expr/alias.rs | 2 +- src/expr/analyze.rs | 2 +- src/expr/between.rs | 2 +- src/expr/binary_expr.rs | 2 +- src/expr/bool_expr.rs | 2 +- src/expr/case.rs | 2 +- src/expr/cast.rs | 2 +- src/expr/column.rs | 2 +- src/expr/conditional_expr.rs | 2 +- src/expr/create_memory_table.rs | 2 +- src/expr/create_view.rs | 2 +- src/expr/cross_join.rs | 2 +- src/expr/distinct.rs | 2 +- src/expr/drop_table.rs | 2 +- src/expr/empty_relation.rs | 2 +- src/expr/exists.rs | 2 +- src/expr/explain.rs | 2 +- src/expr/extension.rs | 2 +- src/expr/filter.rs | 2 +- src/expr/grouping_set.rs | 2 +- src/expr/in_list.rs | 2 +- src/expr/in_subquery.rs | 2 +- src/expr/indexed_field.rs 
| 2 +- src/expr/join.rs | 2 +- src/expr/like.rs | 2 +- src/expr/limit.rs | 2 +- src/expr/literal.rs | 2 +- src/expr/placeholder.rs | 2 +- src/expr/projection.rs | 4 ++-- src/expr/repartition.rs | 2 +- src/expr/scalar_subquery.rs | 2 +- src/expr/signature.rs | 2 +- src/expr/sort.rs | 4 ++-- src/expr/sort_expr.rs | 2 +- src/expr/subquery.rs | 2 +- src/expr/subquery_alias.rs | 2 +- src/expr/table_scan.rs | 4 ++-- src/expr/union.rs | 2 +- src/expr/unnest.rs | 2 +- src/expr/unnest_expr.rs | 2 +- src/expr/window.rs | 6 ++--- src/functions.rs | 41 ++++++++++++++++---------------- src/lib.rs | 9 +++---- src/pyarrow_filter_expression.rs | 4 ++-- src/sql/logical.rs | 2 +- src/udaf.rs | 4 +++- src/udf.rs | 6 ++--- src/utils.rs | 2 +- 62 files changed, 113 insertions(+), 118 deletions(-) diff --git a/.gitignore b/.gitignore index aaeaaa5b1..614d82327 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ dist # intended to run in multiple environments; otherwise, check them in: .python-version venv +.venv apache-rat-*.jar *rat.txt diff --git a/Cargo.lock b/Cargo.lock index bca4bf066..43ee3055d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1072,11 +1072,6 @@ dependencies = [ "arrow", "async-trait", "datafusion", - "datafusion-common", - "datafusion-expr", - "datafusion-functions-nested", - "datafusion-optimizer", - "datafusion-sql", "datafusion-substrait", "futures", "mimalloc", diff --git a/Cargo.toml b/Cargo.toml index 8881884b4..e9a4ababb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,11 +39,6 @@ rand = "0.8" pyo3 = { version = "0.21", features = ["extension-module", "abi3", "abi3-py38"] } arrow = { version = "52", feature = ["pyarrow"] } datafusion = { version = "41.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } -datafusion-common = { version = "41.0.0", features = ["pyarrow"] } -datafusion-expr = { version = "41.0.0" } -datafusion-functions-nested = { version = "41.0.0" } -datafusion-optimizer = { version = "41.0.0" } -datafusion-sql = { version = 
"41.0.0" } datafusion-substrait = { version = "41.0.0", optional = true } prost = "0.12" # keep in line with `datafusion-substrait` prost-types = "0.12" # keep in line with `datafusion-substrait` @@ -67,4 +62,4 @@ crate-type = ["cdylib", "rlib"] [profile.release] lto = true codegen-units = 1 - \ No newline at end of file + diff --git a/src/common/data_type.rs b/src/common/data_type.rs index bdfc5a6b1..a29d1799c 100644 --- a/src/common/data_type.rs +++ b/src/common/data_type.rs @@ -17,8 +17,8 @@ use datafusion::arrow::array::Array; use datafusion::arrow::datatypes::{DataType, IntervalUnit, TimeUnit}; -use datafusion_common::{DataFusionError, ScalarValue}; -use datafusion_expr::sqlparser::ast::NullTreatment as DFNullTreatment; +use datafusion::common::{DataFusionError, ScalarValue}; +use datafusion::logical_expr::sqlparser::ast::NullTreatment as DFNullTreatment; use pyo3::{exceptions::PyValueError, prelude::*}; use crate::errors::py_datafusion_err; diff --git a/src/common/df_schema.rs b/src/common/df_schema.rs index c16b8eba0..4e1d84060 100644 --- a/src/common/df_schema.rs +++ b/src/common/df_schema.rs @@ -17,7 +17,7 @@ use std::sync::Arc; -use datafusion_common::DFSchema; +use datafusion::common::DFSchema; use pyo3::prelude::*; #[derive(Debug, Clone)] diff --git a/src/common/schema.rs b/src/common/schema.rs index 00113a510..5806c90e2 100644 --- a/src/common/schema.rs +++ b/src/common/schema.rs @@ -18,10 +18,10 @@ use std::any::Any; use datafusion::arrow::datatypes::SchemaRef; -use datafusion_expr::{Expr, TableProviderFilterPushDown, TableSource}; +use datafusion::logical_expr::{Expr, TableProviderFilterPushDown, TableSource}; use pyo3::prelude::*; -use datafusion_expr::utils::split_conjunction; +use datafusion::logical_expr::utils::split_conjunction; use super::{data_type::DataTypeMap, function::SqlFunction}; @@ -166,7 +166,7 @@ impl TableSource for SqlTableSource { fn supports_filter_pushdown( &self, filter: &Expr, - ) -> datafusion_common::Result { + ) -> 
datafusion::common::Result { let filters = split_conjunction(filter); if filters.iter().all(|f| is_supported_push_down_expr(f)) { // Push down filters to the tablescan operation if all are supported @@ -180,22 +180,22 @@ impl TableSource for SqlTableSource { } } - fn table_type(&self) -> datafusion_expr::TableType { - datafusion_expr::TableType::Base + fn table_type(&self) -> datafusion::logical_expr::TableType { + datafusion::logical_expr::TableType::Base } #[allow(deprecated)] fn supports_filters_pushdown( &self, filters: &[&Expr], - ) -> datafusion_common::Result> { + ) -> datafusion::common::Result> { filters .iter() .map(|f| self.supports_filter_pushdown(f)) .collect() } - fn get_logical_plan(&self) -> Option<&datafusion_expr::LogicalPlan> { + fn get_logical_plan(&self) -> Option<&datafusion::logical_expr::LogicalPlan> { None } } diff --git a/src/config.rs b/src/config.rs index 82a4f93ab..3f2a05580 100644 --- a/src/config.rs +++ b/src/config.rs @@ -18,8 +18,8 @@ use pyo3::prelude::*; use pyo3::types::*; +use datafusion::common::ScalarValue; use datafusion::config::ConfigOptions; -use datafusion_common::ScalarValue; #[pyclass(name = "Config", module = "datafusion", subclass)] #[derive(Clone)] diff --git a/src/context.rs b/src/context.rs index 3ab783495..11b9fed5f 100644 --- a/src/context.rs +++ b/src/context.rs @@ -46,6 +46,7 @@ use crate::utils::{get_tokio_runtime, wait_for_future}; use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::arrow::pyarrow::PyArrowType; use datafusion::arrow::record_batch::RecordBatch; +use datafusion::common::ScalarValue; use datafusion::datasource::file_format::file_compression_type::FileCompressionType; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::{ @@ -61,7 +62,6 @@ use datafusion::physical_plan::SendableRecordBatchStream; use datafusion::prelude::{ AvroReadOptions, CsvReadOptions, DataFrame, NdJsonReadOptions, ParquetReadOptions, }; -use 
datafusion_common::ScalarValue; use pyo3::types::{PyDict, PyList, PyTuple}; use tokio::task::JoinHandle; @@ -962,7 +962,7 @@ impl PySessionContext { // create a Tokio runtime to run the async code let rt = &get_tokio_runtime(py).0; let plan = plan.plan.clone(); - let fut: JoinHandle> = + let fut: JoinHandle> = rt.spawn(async move { plan.execute(part, Arc::new(ctx)) }); let stream = wait_for_future(py, fut).map_err(py_datafusion_err)?; Ok(PyRecordBatchStream::new(stream?)) @@ -970,7 +970,7 @@ impl PySessionContext { } impl PySessionContext { - async fn _table(&self, name: &str) -> datafusion_common::Result { + async fn _table(&self, name: &str) -> datafusion::common::Result { self.ctx.table(name).await } } diff --git a/src/dataframe.rs b/src/dataframe.rs index 3fb8b2292..f33622cc0 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -27,12 +27,12 @@ use arrow::util::display::{ArrayFormatter, FormatOptions}; use datafusion::arrow::datatypes::Schema; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; use datafusion::arrow::util::pretty; +use datafusion::common::UnnestOptions; use datafusion::config::{CsvOptions, TableParquetOptions}; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::execution::SendableRecordBatchStream; use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; use datafusion::prelude::*; -use datafusion_common::UnnestOptions; use pyo3::exceptions::{PyTypeError, PyValueError}; use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; @@ -541,7 +541,7 @@ impl PyDataFrame { // create a Tokio runtime to run the async code let rt = &get_tokio_runtime(py).0; let df = self.df.as_ref().clone(); - let fut: JoinHandle> = + let fut: JoinHandle> = rt.spawn(async move { df.execute_stream().await }); let stream = wait_for_future(py, fut).map_err(py_datafusion_err)?; Ok(PyRecordBatchStream::new(stream?)) @@ -551,7 +551,7 @@ impl PyDataFrame { // create a Tokio runtime to run the async code let rt = 
&get_tokio_runtime(py).0; let df = self.df.as_ref().clone(); - let fut: JoinHandle>> = + let fut: JoinHandle>> = rt.spawn(async move { df.execute_stream_partitioned().await }); let stream = wait_for_future(py, fut).map_err(py_datafusion_err)?; diff --git a/src/dataset.rs b/src/dataset.rs index b5704164f..de7402fd6 100644 --- a/src/dataset.rs +++ b/src/dataset.rs @@ -31,9 +31,9 @@ use datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::pyarrow::PyArrowType; use datafusion::datasource::{TableProvider, TableType}; use datafusion::error::{DataFusionError, Result as DFResult}; +use datafusion::logical_expr::Expr; use datafusion::logical_expr::TableProviderFilterPushDown; use datafusion::physical_plan::ExecutionPlan; -use datafusion_expr::Expr; use crate::dataset_exec::DatasetExec; use crate::pyarrow_filter_expression::PyArrowFilterExpression; diff --git a/src/dataset_exec.rs b/src/dataset_exec.rs index 5fe1f4d1b..a377e2555 100644 --- a/src/dataset_exec.rs +++ b/src/dataset_exec.rs @@ -32,14 +32,14 @@ use datafusion::arrow::pyarrow::PyArrowType; use datafusion::arrow::record_batch::RecordBatch; use datafusion::error::{DataFusionError as InnerDataFusionError, Result as DFResult}; use datafusion::execution::context::TaskContext; +use datafusion::logical_expr::utils::conjunction; +use datafusion::logical_expr::Expr; use datafusion::physical_expr::{EquivalenceProperties, PhysicalSortExpr}; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, ExecutionPlanProperties, Partitioning, SendableRecordBatchStream, Statistics, }; -use datafusion_expr::utils::conjunction; -use datafusion_expr::Expr; use crate::errors::DataFusionError; use crate::pyarrow_filter_expression::PyArrowFilterExpression; diff --git a/src/expr.rs b/src/expr.rs index 697682d4c..ab16f2872 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -15,8 +15,8 @@ // specific language governing permissions and 
limitations // under the License. -use datafusion_expr::utils::exprlist_to_fields; -use datafusion_expr::{ExprFuncBuilder, ExprFunctionExt, LogicalPlan}; +use datafusion::logical_expr::utils::exprlist_to_fields; +use datafusion::logical_expr::{ExprFuncBuilder, ExprFunctionExt, LogicalPlan}; use pyo3::{basic::CompareOp, prelude::*}; use std::convert::{From, Into}; use std::sync::Arc; @@ -26,12 +26,12 @@ use arrow::pyarrow::ToPyArrow; use datafusion::arrow::datatypes::{DataType, Field}; use datafusion::arrow::pyarrow::PyArrowType; use datafusion::functions::core::expr_ext::FieldAccessor; -use datafusion::scalar::ScalarValue; -use datafusion_expr::{ +use datafusion::logical_expr::{ col, expr::{AggregateFunction, InList, InSubquery, ScalarFunction, Sort, WindowFunction}, lit, Between, BinaryExpr, Case, Cast, Expr, Like, Operator, TryCast, }; +use datafusion::scalar::ScalarValue; use crate::common::data_type::{DataTypeMap, NullTreatment, RexType}; use crate::errors::{py_runtime_err, py_type_err, py_unsupported_variant_err, DataFusionError}; diff --git a/src/expr/aggregate.rs b/src/expr/aggregate.rs index e3d1bb136..389bfb332 100644 --- a/src/expr/aggregate.rs +++ b/src/expr/aggregate.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. -use datafusion_common::DataFusionError; -use datafusion_expr::expr::{AggregateFunction, Alias}; -use datafusion_expr::logical_plan::Aggregate; -use datafusion_expr::Expr; +use datafusion::common::DataFusionError; +use datafusion::logical_expr::expr::{AggregateFunction, Alias}; +use datafusion::logical_expr::logical_plan::Aggregate; +use datafusion::logical_expr::Expr; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/aggregate_expr.rs b/src/expr/aggregate_expr.rs index 15097e007..09471097f 100644 --- a/src/expr/aggregate_expr.rs +++ b/src/expr/aggregate_expr.rs @@ -16,7 +16,7 @@ // under the License. 
use crate::expr::PyExpr; -use datafusion_expr::expr::AggregateFunction; +use datafusion::logical_expr::expr::AggregateFunction; use pyo3::prelude::*; use std::fmt::{Display, Formatter}; diff --git a/src/expr/alias.rs b/src/expr/alias.rs index 3208800ad..e8e03cfad 100644 --- a/src/expr/alias.rs +++ b/src/expr/alias.rs @@ -19,7 +19,7 @@ use crate::expr::PyExpr; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; -use datafusion_expr::expr::Alias; +use datafusion::logical_expr::expr::Alias; #[pyclass(name = "Alias", module = "datafusion.expr", subclass)] #[derive(Clone)] diff --git a/src/expr/analyze.rs b/src/expr/analyze.rs index bbec3a808..084513971 100644 --- a/src/expr/analyze.rs +++ b/src/expr/analyze.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_expr::logical_plan::Analyze; +use datafusion::logical_expr::logical_plan::Analyze; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/between.rs b/src/expr/between.rs index 9b78b9eeb..a2cac1442 100644 --- a/src/expr/between.rs +++ b/src/expr/between.rs @@ -16,7 +16,7 @@ // under the License. use crate::expr::PyExpr; -use datafusion_expr::expr::Between; +use datafusion::logical_expr::expr::Between; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/binary_expr.rs b/src/expr/binary_expr.rs index 5f382b770..740299211 100644 --- a/src/expr/binary_expr.rs +++ b/src/expr/binary_expr.rs @@ -16,7 +16,7 @@ // under the License. use crate::expr::PyExpr; -use datafusion_expr::BinaryExpr; +use datafusion::logical_expr::BinaryExpr; use pyo3::prelude::*; #[pyclass(name = "BinaryExpr", module = "datafusion.expr", subclass)] diff --git a/src/expr/bool_expr.rs b/src/expr/bool_expr.rs index d1502a4eb..e67e25d74 100644 --- a/src/expr/bool_expr.rs +++ b/src/expr/bool_expr.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion_expr::Expr; +use datafusion::logical_expr::Expr; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/case.rs b/src/expr/case.rs index 605275376..92e28ba56 100644 --- a/src/expr/case.rs +++ b/src/expr/case.rs @@ -16,7 +16,7 @@ // under the License. use crate::expr::PyExpr; -use datafusion_expr::Case; +use datafusion::logical_expr::Case; use pyo3::prelude::*; #[pyclass(name = "Case", module = "datafusion.expr", subclass)] diff --git a/src/expr/cast.rs b/src/expr/cast.rs index a72199876..b8faea634 100644 --- a/src/expr/cast.rs +++ b/src/expr/cast.rs @@ -16,7 +16,7 @@ // under the License. use crate::{common::data_type::PyDataType, expr::PyExpr}; -use datafusion_expr::{Cast, TryCast}; +use datafusion::logical_expr::{Cast, TryCast}; use pyo3::prelude::*; #[pyclass(name = "Cast", module = "datafusion.expr", subclass)] diff --git a/src/expr/column.rs b/src/expr/column.rs index 68123fb04..365dbc0d2 100644 --- a/src/expr/column.rs +++ b/src/expr/column.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_common::Column; +use datafusion::common::Column; use pyo3::prelude::*; #[pyclass(name = "Column", module = "datafusion.expr", subclass)] diff --git a/src/expr/conditional_expr.rs b/src/expr/conditional_expr.rs index 96ef58f56..a8a885c54 100644 --- a/src/expr/conditional_expr.rs +++ b/src/expr/conditional_expr.rs @@ -16,7 +16,7 @@ // under the License. 
use crate::expr::PyExpr; -use datafusion_expr::conditional_expressions::CaseBuilder; +use datafusion::logical_expr::conditional_expressions::CaseBuilder; use pyo3::prelude::*; #[pyclass(name = "CaseBuilder", module = "datafusion.expr", subclass)] diff --git a/src/expr/create_memory_table.rs b/src/expr/create_memory_table.rs index 509bf2168..01ebb66b0 100644 --- a/src/expr/create_memory_table.rs +++ b/src/expr/create_memory_table.rs @@ -17,7 +17,7 @@ use std::fmt::{self, Display, Formatter}; -use datafusion_expr::CreateMemoryTable; +use datafusion::logical_expr::CreateMemoryTable; use pyo3::prelude::*; use crate::sql::logical::PyLogicalPlan; diff --git a/src/expr/create_view.rs b/src/expr/create_view.rs index febd723c5..d119f5c21 100644 --- a/src/expr/create_view.rs +++ b/src/expr/create_view.rs @@ -17,7 +17,7 @@ use std::fmt::{self, Display, Formatter}; -use datafusion_expr::{CreateView, DdlStatement, LogicalPlan}; +use datafusion::logical_expr::{CreateView, DdlStatement, LogicalPlan}; use pyo3::prelude::*; use crate::{errors::py_type_err, sql::logical::PyLogicalPlan}; diff --git a/src/expr/cross_join.rs b/src/expr/cross_join.rs index 68793f249..5bc202aac 100644 --- a/src/expr/cross_join.rs +++ b/src/expr/cross_join.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion_expr::logical_plan::CrossJoin; +use datafusion::logical_expr::logical_plan::CrossJoin; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/distinct.rs b/src/expr/distinct.rs index 5d3a0b459..061ab4824 100644 --- a/src/expr/distinct.rs +++ b/src/expr/distinct.rs @@ -17,7 +17,7 @@ use std::fmt::{self, Display, Formatter}; -use datafusion_expr::Distinct; +use datafusion::logical_expr::Distinct; use pyo3::prelude::*; use crate::sql::logical::PyLogicalPlan; diff --git a/src/expr/drop_table.rs b/src/expr/drop_table.rs index 2a8836db5..330156abe 100644 --- a/src/expr/drop_table.rs +++ b/src/expr/drop_table.rs @@ -17,7 +17,7 @@ use std::fmt::{self, Display, Formatter}; -use datafusion_expr::logical_plan::DropTable; +use datafusion::logical_expr::logical_plan::DropTable; use pyo3::prelude::*; use crate::sql::logical::PyLogicalPlan; diff --git a/src/expr/empty_relation.rs b/src/expr/empty_relation.rs index 0bc222e59..ce7163466 100644 --- a/src/expr/empty_relation.rs +++ b/src/expr/empty_relation.rs @@ -16,7 +16,7 @@ // under the License. use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; -use datafusion_expr::EmptyRelation; +use datafusion::logical_expr::EmptyRelation; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/exists.rs b/src/expr/exists.rs index fd2aa8c2f..693357836 100644 --- a/src/expr/exists.rs +++ b/src/expr/exists.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion_expr::expr::Exists; +use datafusion::logical_expr::expr::Exists; use pyo3::prelude::*; use super::subquery::PySubquery; diff --git a/src/expr/explain.rs b/src/expr/explain.rs index d5d6a7bbd..8e7fb8843 100644 --- a/src/expr/explain.rs +++ b/src/expr/explain.rs @@ -17,7 +17,7 @@ use std::fmt::{self, Display, Formatter}; -use datafusion_expr::{logical_plan::Explain, LogicalPlan}; +use datafusion::logical_expr::{logical_plan::Explain, LogicalPlan}; use pyo3::prelude::*; use crate::{common::df_schema::PyDFSchema, errors::py_type_err, sql::logical::PyLogicalPlan}; diff --git a/src/expr/extension.rs b/src/expr/extension.rs index 81a435c23..a29802b0b 100644 --- a/src/expr/extension.rs +++ b/src/expr/extension.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_expr::Extension; +use datafusion::logical_expr::Extension; use pyo3::prelude::*; use crate::sql::logical::PyLogicalPlan; diff --git a/src/expr/filter.rs b/src/expr/filter.rs index 2def2f7d6..a6d8aa7ee 100644 --- a/src/expr/filter.rs +++ b/src/expr/filter.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_expr::logical_plan::Filter; +use datafusion::logical_expr::logical_plan::Filter; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/grouping_set.rs b/src/expr/grouping_set.rs index b73932863..63a1c0b50 100644 --- a/src/expr/grouping_set.rs +++ b/src/expr/grouping_set.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_expr::GroupingSet; +use datafusion::logical_expr::GroupingSet; use pyo3::prelude::*; #[pyclass(name = "GroupingSet", module = "datafusion.expr", subclass)] diff --git a/src/expr/in_list.rs b/src/expr/in_list.rs index c1a99a3c8..5dfd8d8eb 100644 --- a/src/expr/in_list.rs +++ b/src/expr/in_list.rs @@ -16,7 +16,7 @@ // under the License. 
use crate::expr::PyExpr; -use datafusion_expr::expr::InList; +use datafusion::logical_expr::expr::InList; use pyo3::prelude::*; #[pyclass(name = "InList", module = "datafusion.expr", subclass)] diff --git a/src/expr/in_subquery.rs b/src/expr/in_subquery.rs index 7dfafdbf0..306b68a6e 100644 --- a/src/expr/in_subquery.rs +++ b/src/expr/in_subquery.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_expr::expr::InSubquery; +use datafusion::logical_expr::expr::InSubquery; use pyo3::prelude::*; use super::{subquery::PySubquery, PyExpr}; diff --git a/src/expr/indexed_field.rs b/src/expr/indexed_field.rs index e0dad6a4a..a22dc6b27 100644 --- a/src/expr/indexed_field.rs +++ b/src/expr/indexed_field.rs @@ -16,7 +16,7 @@ // under the License. use crate::expr::PyExpr; -use datafusion_expr::expr::{GetFieldAccess, GetIndexedField}; +use datafusion::logical_expr::expr::{GetFieldAccess, GetIndexedField}; use pyo3::prelude::*; use std::fmt::{Display, Formatter}; diff --git a/src/expr/join.rs b/src/expr/join.rs index a53ddd3ba..66e677f8a 100644 --- a/src/expr/join.rs +++ b/src/expr/join.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_expr::logical_plan::{Join, JoinConstraint, JoinType}; +use datafusion::logical_expr::logical_plan::{Join, JoinConstraint, JoinType}; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/like.rs b/src/expr/like.rs index 6ed3c2467..2e1f060bd 100644 --- a/src/expr/like.rs +++ b/src/expr/like.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion_expr::expr::Like; +use datafusion::logical_expr::expr::Like; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/limit.rs b/src/expr/limit.rs index d7b3f4ca5..876e154c1 100644 --- a/src/expr/limit.rs +++ b/src/expr/limit.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_expr::logical_plan::Limit; +use datafusion::logical_expr::logical_plan::Limit; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/literal.rs b/src/expr/literal.rs index 0333432f6..43084ba4b 100644 --- a/src/expr/literal.rs +++ b/src/expr/literal.rs @@ -16,7 +16,7 @@ // under the License. use crate::errors::DataFusionError; -use datafusion_common::ScalarValue; +use datafusion::common::ScalarValue; use pyo3::prelude::*; #[pyclass(name = "Literal", module = "datafusion.expr", subclass)] diff --git a/src/expr/placeholder.rs b/src/expr/placeholder.rs index ca75ce37e..4ac2c47e3 100644 --- a/src/expr/placeholder.rs +++ b/src/expr/placeholder.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_expr::expr::Placeholder; +use datafusion::logical_expr::expr::Placeholder; use pyo3::prelude::*; use crate::common::data_type::PyDataType; diff --git a/src/expr/projection.rs b/src/expr/projection.rs index 8c1423df4..36534fdb2 100644 --- a/src/expr/projection.rs +++ b/src/expr/projection.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion_expr::logical_plan::Projection; -use datafusion_expr::Expr; +use datafusion::logical_expr::logical_plan::Projection; +use datafusion::logical_expr::Expr; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/repartition.rs b/src/expr/repartition.rs index e3e14f878..4e680e181 100644 --- a/src/expr/repartition.rs +++ b/src/expr/repartition.rs @@ -17,7 +17,7 @@ use std::fmt::{self, Display, Formatter}; -use datafusion_expr::{logical_plan::Repartition, Expr, Partitioning}; +use datafusion::logical_expr::{logical_plan::Repartition, Expr, Partitioning}; use pyo3::prelude::*; use crate::{errors::py_type_err, sql::logical::PyLogicalPlan}; diff --git a/src/expr/scalar_subquery.rs b/src/expr/scalar_subquery.rs index c71bb9905..9d35f28a9 100644 --- a/src/expr/scalar_subquery.rs +++ b/src/expr/scalar_subquery.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_expr::Subquery; +use datafusion::logical_expr::Subquery; use pyo3::prelude::*; use super::subquery::PySubquery; diff --git a/src/expr/signature.rs b/src/expr/signature.rs index 7882cebed..e85763555 100644 --- a/src/expr/signature.rs +++ b/src/expr/signature.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_expr::{TypeSignature, Volatility}; +use datafusion::logical_expr::{TypeSignature, Volatility}; use pyo3::prelude::*; #[allow(dead_code)] diff --git a/src/expr/sort.rs b/src/expr/sort.rs index f9f9e5899..b31ebfe0b 100644 --- a/src/expr/sort.rs +++ b/src/expr/sort.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion_common::DataFusionError; -use datafusion_expr::logical_plan::Sort; +use datafusion::common::DataFusionError; +use datafusion::logical_expr::logical_plan::Sort; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/sort_expr.rs b/src/expr/sort_expr.rs index 6a8a0cf0c..4299d1f71 100644 --- a/src/expr/sort_expr.rs +++ b/src/expr/sort_expr.rs @@ -16,7 +16,7 @@ // under the License. use crate::expr::PyExpr; -use datafusion_expr::SortExpr; +use datafusion::logical_expr::SortExpr; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/subquery.rs b/src/expr/subquery.rs index f6f7b7fe5..dac8d0a2b 100644 --- a/src/expr/subquery.rs +++ b/src/expr/subquery.rs @@ -17,7 +17,7 @@ use std::fmt::{self, Display, Formatter}; -use datafusion_expr::Subquery; +use datafusion::logical_expr::Subquery; use pyo3::prelude::*; use crate::sql::logical::PyLogicalPlan; diff --git a/src/expr/subquery_alias.rs b/src/expr/subquery_alias.rs index d3abd2e8c..a83cff96d 100644 --- a/src/expr/subquery_alias.rs +++ b/src/expr/subquery_alias.rs @@ -17,7 +17,7 @@ use std::fmt::{self, Display, Formatter}; -use datafusion_expr::SubqueryAlias; +use datafusion::logical_expr::SubqueryAlias; use pyo3::prelude::*; use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; diff --git a/src/expr/table_scan.rs b/src/expr/table_scan.rs index 386c43b6a..f61be7fe4 100644 --- a/src/expr/table_scan.rs +++ b/src/expr/table_scan.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion_common::TableReference; -use datafusion_expr::logical_plan::TableScan; +use datafusion::common::TableReference; +use datafusion::logical_expr::logical_plan::TableScan; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/union.rs b/src/expr/union.rs index 98e8eaae6..62488d9a1 100644 --- a/src/expr/union.rs +++ b/src/expr/union.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_expr::logical_plan::Union; +use datafusion::logical_expr::logical_plan::Union; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/unnest.rs b/src/expr/unnest.rs index 33fb82feb..adc705035 100644 --- a/src/expr/unnest.rs +++ b/src/expr/unnest.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_expr::logical_plan::Unnest; +use datafusion::logical_expr::logical_plan::Unnest; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/unnest_expr.rs b/src/expr/unnest_expr.rs index a2f8230cc..2234d24b1 100644 --- a/src/expr/unnest_expr.rs +++ b/src/expr/unnest_expr.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_expr::expr::Unnest; +use datafusion::logical_expr::expr::Unnest; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/expr/window.rs b/src/expr/window.rs index 7eb586082..f17a6dd9b 100644 --- a/src/expr/window.rs +++ b/src/expr/window.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion_common::{DataFusionError, ScalarValue}; -use datafusion_expr::expr::WindowFunction; -use datafusion_expr::{Expr, Window, WindowFrame, WindowFrameBound, WindowFrameUnits}; +use datafusion::common::{DataFusionError, ScalarValue}; +use datafusion::logical_expr::expr::WindowFunction; +use datafusion::logical_expr::{Expr, Window, WindowFrame, WindowFrameBound, WindowFrameUnits}; use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; diff --git a/src/functions.rs b/src/functions.rs index aed4de474..b5b003dfe 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -16,9 +16,9 @@ // under the License. use datafusion::functions_aggregate::all_default_aggregate_functions; -use datafusion_expr::window_function; -use datafusion_expr::ExprFunctionExt; -use datafusion_expr::WindowFrame; +use datafusion::logical_expr::window_function; +use datafusion::logical_expr::ExprFunctionExt; +use datafusion::logical_expr::WindowFrame; use pyo3::{prelude::*, wrap_pyfunction}; use crate::common::data_type::NullTreatment; @@ -28,13 +28,13 @@ use crate::expr::conditional_expr::PyCaseBuilder; use crate::expr::to_sort_expressions; use crate::expr::window::PyWindowFrame; use crate::expr::PyExpr; +use datafusion::common::{Column, ScalarValue, TableReference}; use datafusion::execution::FunctionRegistry; use datafusion::functions; use datafusion::functions_aggregate; -use datafusion_common::{Column, ScalarValue, TableReference}; -use datafusion_expr::expr::Alias; -use datafusion_expr::sqlparser::ast::NullTreatment as DFNullTreatment; -use datafusion_expr::{ +use datafusion::logical_expr::expr::Alias; +use datafusion::logical_expr::sqlparser::ast::NullTreatment as DFNullTreatment; +use datafusion::logical_expr::{ expr::{find_df_window_func, AggregateFunction, Sort, WindowFunction}, lit, Expr, WindowFunctionDefinition, }; @@ -376,7 +376,7 @@ pub fn last_value( #[pyfunction] fn in_list(expr: PyExpr, value: Vec, negated: bool) -> PyExpr { - datafusion_expr::in_list( + 
datafusion::logical_expr::in_list( expr.expr, value.into_iter().map(|x| x.expr).collect::>(), negated, @@ -386,14 +386,14 @@ fn in_list(expr: PyExpr, value: Vec, negated: bool) -> PyExpr { #[pyfunction] fn make_array(exprs: Vec) -> PyExpr { - datafusion_functions_nested::expr_fn::make_array(exprs.into_iter().map(|x| x.into()).collect()) + datafusion::functions_nested::expr_fn::make_array(exprs.into_iter().map(|x| x.into()).collect()) .into() } #[pyfunction] fn array_concat(exprs: Vec) -> PyExpr { let exprs = exprs.into_iter().map(|x| x.into()).collect(); - datafusion_functions_nested::expr_fn::array_concat(exprs).into() + datafusion::functions_nested::expr_fn::array_concat(exprs).into() } #[pyfunction] @@ -405,12 +405,13 @@ fn array_cat(exprs: Vec) -> PyExpr { fn array_position(array: PyExpr, element: PyExpr, index: Option) -> PyExpr { let index = ScalarValue::Int64(index); let index = Expr::Literal(index); - datafusion_functions_nested::expr_fn::array_position(array.into(), element.into(), index).into() + datafusion::functions_nested::expr_fn::array_position(array.into(), element.into(), index) + .into() } #[pyfunction] fn array_slice(array: PyExpr, begin: PyExpr, end: PyExpr, stride: Option) -> PyExpr { - datafusion_functions_nested::expr_fn::array_slice( + datafusion::functions_nested::expr_fn::array_slice( array.into(), begin.into(), end.into(), @@ -476,7 +477,7 @@ fn regexp_replace( #[pyfunction] fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) -> PyResult { Ok(PyExpr { - expr: datafusion_expr::Expr::Sort(Sort { + expr: datafusion::logical_expr::Expr::Sort(Sort { expr: Box::new(expr.expr), asc, nulls_first, @@ -489,7 +490,7 @@ fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) -> PyResult { fn alias(expr: PyExpr, name: &str) -> PyResult { let relation: Option = None; Ok(PyExpr { - expr: datafusion_expr::Expr::Alias(Alias::new(expr.expr, relation, name)), + expr: datafusion::logical_expr::Expr::Alias(Alias::new(expr.expr, relation, name)), }) } @@ 
-497,7 +498,7 @@ fn alias(expr: PyExpr, name: &str) -> PyResult { #[pyfunction] fn col(name: &str) -> PyResult { Ok(PyExpr { - expr: datafusion_expr::Expr::Column(Column { + expr: datafusion::logical_expr::Expr::Column(Column { relation: None, name: name.to_string(), }), @@ -527,7 +528,7 @@ fn count(expr: PyExpr, distinct: bool) -> PyResult { #[pyfunction] fn case(expr: PyExpr) -> PyResult { Ok(PyCaseBuilder { - case_builder: datafusion_expr::case(expr.expr), + case_builder: datafusion::logical_expr::case(expr.expr), }) } @@ -535,7 +536,7 @@ fn case(expr: PyExpr) -> PyResult { #[pyfunction] fn when(when: PyExpr, then: PyExpr) -> PyResult { Ok(PyCaseBuilder { - case_builder: datafusion_expr::when(when.expr, then.expr), + case_builder: datafusion::logical_expr::when(when.expr, then.expr), }) } @@ -622,7 +623,7 @@ fn window( .unwrap_or(WindowFrame::new(order_by.as_ref().map(|v| !v.is_empty()))); Ok(PyExpr { - expr: datafusion_expr::Expr::WindowFunction(WindowFunction { + expr: datafusion::logical_expr::Expr::WindowFunction(WindowFunction { fun, args: args.into_iter().map(|x| x.expr).collect::>(), partition_by: partition_by @@ -654,7 +655,7 @@ macro_rules! aggregate_function { #[pyfunction] #[pyo3(signature = (*args, distinct=false))] fn $NAME(args: Vec, distinct: bool) -> PyExpr { - let expr = datafusion_expr::Expr::AggregateFunction(AggregateFunction { + let expr = datafusion::logical_expr::Expr::AggregateFunction(AggregateFunction { func: $FUNC(), args: args.into_iter().map(|e| e.into()).collect(), distinct, @@ -724,7 +725,7 @@ macro_rules! 
array_fn { #[doc = $DOC] #[pyfunction] fn $FUNC($($arg: PyExpr),*) -> PyExpr { - datafusion_functions_nested::expr_fn::$FUNC($($arg.into()),*).into() + datafusion::functions_nested::expr_fn::$FUNC($($arg.into()),*).into() } }; } diff --git a/src/lib.rs b/src/lib.rs index 357eaacd9..e4cc24078 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,10 +21,10 @@ use pyo3::prelude::*; // Re-export Apache Arrow DataFusion dependencies pub use datafusion; -pub use datafusion_common; -pub use datafusion_expr; -pub use datafusion_optimizer; -pub use datafusion_sql; +pub use datafusion::common as datafusion_common; +pub use datafusion::logical_expr as datafusion_expr; +pub use datafusion::optimizer; +pub use datafusion::sql as datafusion_sql; #[cfg(feature = "substrait")] pub use datafusion_substrait; @@ -32,6 +32,7 @@ pub use datafusion_substrait; #[allow(clippy::borrow_deref_ref)] pub mod catalog; pub mod common; + #[allow(clippy::borrow_deref_ref)] mod config; #[allow(clippy::borrow_deref_ref)] diff --git a/src/pyarrow_filter_expression.rs b/src/pyarrow_filter_expression.rs index ff447e1ab..6e2a45e1a 100644 --- a/src/pyarrow_filter_expression.rs +++ b/src/pyarrow_filter_expression.rs @@ -22,8 +22,8 @@ use std::convert::TryFrom; use std::result::Result; use arrow::pyarrow::ToPyArrow; -use datafusion_common::{Column, ScalarValue}; -use datafusion_expr::{expr::InList, Between, BinaryExpr, Expr, Operator}; +use datafusion::common::{Column, ScalarValue}; +use datafusion::logical_expr::{expr::InList, Between, BinaryExpr, Expr, Operator}; use crate::errors::DataFusionError; diff --git a/src/sql/logical.rs b/src/sql/logical.rs index c4471f503..89655ab70 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -35,7 +35,7 @@ use crate::expr::subquery_alias::PySubqueryAlias; use crate::expr::table_scan::PyTableScan; use crate::expr::unnest::PyUnnest; use crate::expr::window::PyWindow; -use datafusion_expr::LogicalPlan; +use datafusion::logical_expr::LogicalPlan; use pyo3::prelude::*; 
use crate::expr::logical_node::LogicalNode; diff --git a/src/udaf.rs b/src/udaf.rs index 2041e5a74..a6aa59ac3 100644 --- a/src/udaf.rs +++ b/src/udaf.rs @@ -24,7 +24,9 @@ use datafusion::arrow::datatypes::DataType; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; use datafusion::common::ScalarValue; use datafusion::error::{DataFusionError, Result}; -use datafusion_expr::{create_udaf, Accumulator, AccumulatorFactoryFunction, AggregateUDF}; +use datafusion::logical_expr::{ + create_udaf, Accumulator, AccumulatorFactoryFunction, AggregateUDF, +}; use crate::expr::PyExpr; use crate::utils::parse_volatility; diff --git a/src/udf.rs b/src/udf.rs index 8f5ca30b1..8bd9021d4 100644 --- a/src/udf.rs +++ b/src/udf.rs @@ -23,9 +23,9 @@ use datafusion::arrow::array::{make_array, Array, ArrayData, ArrayRef}; use datafusion::arrow::datatypes::DataType; use datafusion::arrow::pyarrow::{FromPyArrow, PyArrowType, ToPyArrow}; use datafusion::error::DataFusionError; -use datafusion_expr::create_udf; -use datafusion_expr::function::ScalarFunctionImplementation; -use datafusion_expr::ScalarUDF; +use datafusion::logical_expr::create_udf; +use datafusion::logical_expr::function::ScalarFunctionImplementation; +use datafusion::logical_expr::ScalarUDF; use crate::expr::PyExpr; use crate::utils::parse_volatility; diff --git a/src/utils.rs b/src/utils.rs index 4334f86cd..0d72eaf75 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -17,7 +17,7 @@ use crate::errors::DataFusionError; use crate::TokioRuntime; -use datafusion_expr::Volatility; +use datafusion::logical_expr::Volatility; use pyo3::prelude::*; use std::future::Future; use tokio::runtime::Runtime; From 4ea0032ff890fb5ab0838e42b8b66ffbd521f4df Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Sat, 7 Sep 2024 06:07:27 -0500 Subject: [PATCH 030/248] add section to the contributor guide on separating python and rust code (#860) --- docs/source/contributor-guide/introduction.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff 
--git a/docs/source/contributor-guide/introduction.rst b/docs/source/contributor-guide/introduction.rst index 6de2b87bc..4457a898f 100644 --- a/docs/source/contributor-guide/introduction.rst +++ b/docs/source/contributor-guide/introduction.rst @@ -71,6 +71,17 @@ Our pre-commit hooks can be installed by running :code:`pre-commit install`, whi The pre-commit hooks can also be run adhoc without installing them by simply running :code:`pre-commit run --all-files` +Guidelines for Separating Python and Rust Code +---------------------------------------------- + +Version 40 of ``datafusion-python`` introduced ``python`` wrappers around the ``pyo3`` generated code to vastly improve the user experience. (See the `blog post `_ and `pull request `_ for more details.) + +Mostly, the ``python`` code is limited to pure wrappers with type hints and good docstrings, but there are a few cases where the code does more: + +1. Trivial aliases like :py:func:`~datafusion.functions.array_append` and :py:func:`~datafusion.functions.list_append`. +2. Simple type conversion, like from a ``path`` to a ``string`` of the path or from ``number`` to ``lit(number)``. +3. The additional code makes an API **much** more pythonic, like we do for :py:func:`~datafusion.functions.named_struct` (see `source code `_).
+ Update Dependencies ------------------- From fd8224e1d32282c1664575f32a2eb5d39ffaabba Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 9 Sep 2024 14:13:21 -0400 Subject: [PATCH 031/248] Update Aggregate functions to take builder parameters (#859) * Add NullTreatment enum wrapper and add filter option to approx_distinct * Small usability on aggregate * Adding documentation and additional unit test for approx_median * Update approx_percentil_cont with builder parameters it uses, which is filter but not distinct * Update approx_percentil_cont_with_weight with builder parameters it uses, which is filter but not distinct * Update array_agg to use aggregate options * Update builder options for avg aggregate function * move bit_and bit_or to use macro to generaty python fn * Update builder arguments for bitwise operators * Use macro for bool_and and bool_or * Update python wrapper for arguments appropriate to bool operators * Set corr to use macro for pyfunction * Update unit test to make it easier to debug * Update corr python wrapper to expose only builder parameters used * Update count and count_star to use macro for exposing * Update count and count_star with approprate aggregation options * Move covar_pop and covar_samp to use macro for aggregates * Updateing covar_pop and covar_samp with builder option * Use macro for last_value and move first_value to be near it * Update first_value and last_value with the builder parameters that are relevant * Remove grouping since it is not actually implemented upstream * Move median to use macro * Expose builder options for median * Expose nth value * Updating linear regression functions to use filter and macro * Update stddev and stddev_pop to use filter and macro * Expose string_agg * Add string_agg to python wrappers and add unit test * Switch sum to use macro in rust side and expose correct options in python wrapper * Use macro for exposing var_pop and var_samp * Add unit tests for filtering on var_pop and var_samp * 
Move approximation functions to use macro when possible * Update user documentation to explain in detail the options for aggregate functions * Update unit test to handle Python 3.10 * Clean up commented code --- .../common-operations/aggregations.rst | 206 ++++- python/datafusion/common.py | 15 +- python/datafusion/dataframe.py | 7 +- python/datafusion/expr.py | 4 +- python/datafusion/functions.py | 867 ++++++++++++++---- python/datafusion/tests/test_aggregation.py | 359 +++++++- python/datafusion/tests/test_functions.py | 99 +- .../datafusion/tests/test_wrapper_coverage.py | 11 + src/functions.rs | 521 ++++------- 9 files changed, 1470 insertions(+), 619 deletions(-) diff --git a/docs/source/user-guide/common-operations/aggregations.rst b/docs/source/user-guide/common-operations/aggregations.rst index 7ad402210..8fee26a15 100644 --- a/docs/source/user-guide/common-operations/aggregations.rst +++ b/docs/source/user-guide/common-operations/aggregations.rst @@ -20,43 +20,205 @@ Aggregation ============ -An aggregate or aggregation is a function where the values of multiple rows are processed together to form a single summary value. -For performing an aggregation, DataFusion provides the :py:func:`~datafusion.dataframe.DataFrame.aggregate` +An aggregate or aggregation is a function where the values of multiple rows are processed together +to form a single summary value. For performing an aggregation, DataFusion provides the +:py:func:`~datafusion.dataframe.DataFrame.aggregate` .. 
ipython:: python + import urllib.request from datafusion import SessionContext - from datafusion import column, lit + from datafusion import col, lit from datafusion import functions as f - import random - ctx = SessionContext() - df = ctx.from_pydict( - { - "a": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], - "b": ["one", "one", "two", "three", "two", "two", "one", "three"], - "c": [random.randint(0, 100) for _ in range(8)], - "d": [random.random() for _ in range(8)], - }, - name="foo_bar" + urllib.request.urlretrieve( + "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv", + "pokemon.csv", ) - col_a = column("a") - col_b = column("b") - col_c = column("c") - col_d = column("d") + ctx = SessionContext() + df = ctx.read_csv("pokemon.csv") + + col_type_1 = col('"Type 1"') + col_type_2 = col('"Type 2"') + col_speed = col('"Speed"') + col_attack = col('"Attack"') - df.aggregate([], [f.approx_distinct(col_c), f.approx_median(col_d), f.approx_percentile_cont(col_d, lit(0.5))]) + df.aggregate([col_type_1], [ + f.approx_distinct(col_speed).alias("Count"), + f.approx_median(col_speed).alias("Median Speed"), + f.approx_percentile_cont(col_speed, 0.9).alias("90% Speed")]) -When the :code:`group_by` list is empty the aggregation is done over the whole :class:`.DataFrame`. For grouping -the :code:`group_by` list must contain at least one column +When the :code:`group_by` list is empty the aggregation is done over the whole :class:`.DataFrame`. +For grouping the :code:`group_by` list must contain at least one column. .. ipython:: python - df.aggregate([col_a], [f.sum(col_c), f.max(col_d), f.min(col_d)]) + df.aggregate([col_type_1], [ + f.max(col_speed).alias("Max Speed"), + f.avg(col_speed).alias("Avg Speed"), + f.min(col_speed).alias("Min Speed")]) More than one column can be used for grouping .. 
ipython:: python - df.aggregate([col_a, col_b], [f.sum(col_c), f.max(col_d), f.min(col_d)]) + df.aggregate([col_type_1, col_type_2], [ + f.max(col_speed).alias("Max Speed"), + f.avg(col_speed).alias("Avg Speed"), + f.min(col_speed).alias("Min Speed")]) + + + +Setting Parameters +------------------ + +Each of the built in aggregate functions provides arguments for the parameters that affect their +operation. These can also be overridden using the builder approach to setting any of the following +parameters. When you use the builder, you must call ``build()`` to finish. For example, these two +expressions are equivalent. + +.. ipython:: python + + first_1 = f.first_value(col("a"), order_by=[col("a")]) + first_2 = f.first_value(col("a")).order_by(col("a")).build() + +Ordering +^^^^^^^^ + +You can control the order in which rows are processed by aggregate functions by providing +a list of ``order_by`` functions for the ``order_by`` parameter. In the following example, we +sort the Pokemon by their attack in increasing order and take the first value, which gives us the +Pokemon with the smallest attack value in each ``Type 1``. + +.. ipython:: python + + df.aggregate( + [col('"Type 1"')], + [f.first_value( + col('"Name"'), + order_by=[col('"Attack"').sort(ascending=True)] + ).alias("Smallest Attack") + ]) + +Distinct +^^^^^^^^ + +When you set the parameter ``distinct`` to ``True``, then unique values will only be evaluated one +time each. Suppose we want to create an array of all of the ``Type 2`` for each ``Type 1`` of our +Pokemon set. Since there will be many entries of ``Type 2`` we only want each distinct value. + +.. ipython:: python + + df.aggregate([col_type_1], [f.array_agg(col_type_2, distinct=True).alias("Type 2 List")]) + +In the output of the above we can see that there are some ``Type 1`` for which the ``Type 2`` entry +is ``null``. In reality, we probably want to filter those out. We can do this in two ways.
First, +we can filter DataFrame rows that have no ``Type 2``. If we do this, we might have some ``Type 1`` +entries entirely removed. Second, we can use the ``filter`` argument described below. + +.. ipython:: python + + df.filter(col_type_2.is_not_null()).aggregate([col_type_1], [f.array_agg(col_type_2, distinct=True).alias("Type 2 List")]) + + df.aggregate([col_type_1], [f.array_agg(col_type_2, distinct=True, filter=col_type_2.is_not_null()).alias("Type 2 List")]) + +Which approach you take should depend on your use case. + +Null Treatment +^^^^^^^^^^^^^^ + +This option allows you to either respect or ignore null values. + +One common usage for handling nulls is the case where you want to find the first value within a +partition. By setting the null treatment to ignore nulls, we can find the first non-null value +in our partition. + + +.. ipython:: python + + from datafusion.common import NullTreatment + + df.aggregate([col_type_1], [ + f.first_value( + col_type_2, + order_by=[col_attack], + null_treatment=NullTreatment.RESPECT_NULLS + ).alias("Lowest Attack Type 2")]) + + df.aggregate([col_type_1], [ + f.first_value( + col_type_2, + order_by=[col_attack], + null_treatment=NullTreatment.IGNORE_NULLS + ).alias("Lowest Attack Type 2")]) + +Filter +^^^^^^ + +The ``filter`` option restricts which rows are evaluated by the aggregate function. As the +``array_agg`` example above shows, this lets you exclude rows from a single aggregate without +removing those rows from the entire DataFrame. + +Filter takes a single expression. + +Suppose we want to find the speed values for only Pokemon that have low Attack values. + +.. ipython:: python + + df.aggregate([col_type_1], [ + f.avg(col_speed).alias("Avg Speed All"), + f.avg(col_speed, filter=col_attack < lit(50)).alias("Avg Speed Low Attack")]) + + +Aggregate Functions +------------------- + +The available aggregate functions are: + +1.
Comparison Functions + - :py:func:`datafusion.functions.min` + - :py:func:`datafusion.functions.max` +2. Math Functions + - :py:func:`datafusion.functions.sum` + - :py:func:`datafusion.functions.avg` + - :py:func:`datafusion.functions.median` +3. Array Functions + - :py:func:`datafusion.functions.array_agg` +4. Logical Functions + - :py:func:`datafusion.functions.bit_and` + - :py:func:`datafusion.functions.bit_or` + - :py:func:`datafusion.functions.bit_xor` + - :py:func:`datafusion.functions.bool_and` + - :py:func:`datafusion.functions.bool_or` +5. Statistical Functions + - :py:func:`datafusion.functions.count` + - :py:func:`datafusion.functions.corr` + - :py:func:`datafusion.functions.covar_samp` + - :py:func:`datafusion.functions.covar_pop` + - :py:func:`datafusion.functions.stddev` + - :py:func:`datafusion.functions.stddev_pop` + - :py:func:`datafusion.functions.var_samp` + - :py:func:`datafusion.functions.var_pop` +6. Linear Regression Functions + - :py:func:`datafusion.functions.regr_count` + - :py:func:`datafusion.functions.regr_slope` + - :py:func:`datafusion.functions.regr_intercept` + - :py:func:`datafusion.functions.regr_r2` + - :py:func:`datafusion.functions.regr_avgx` + - :py:func:`datafusion.functions.regr_avgy` + - :py:func:`datafusion.functions.regr_sxx` + - :py:func:`datafusion.functions.regr_syy` + - :py:func:`datafusion.functions.regr_sxy` +7. Positional Functions + - :py:func:`datafusion.functions.first_value` + - :py:func:`datafusion.functions.last_value` + - :py:func:`datafusion.functions.nth_value` +8. String Functions + - :py:func:`datafusion.functions.string_agg` +9.
Approximation Functions + - :py:func:`datafusion.functions.approx_distinct` + - :py:func:`datafusion.functions.approx_median` + - :py:func:`datafusion.functions.approx_percentile_cont` + - :py:func:`datafusion.functions.approx_percentile_cont_with_weight` + diff --git a/python/datafusion/common.py b/python/datafusion/common.py index 225e33304..7db8333f2 100644 --- a/python/datafusion/common.py +++ b/python/datafusion/common.py @@ -17,13 +17,13 @@ """Common data types used throughout the DataFusion project.""" from ._internal import common as common_internal +from enum import Enum # TODO these should all have proper wrapper classes DFSchema = common_internal.DFSchema DataType = common_internal.DataType DataTypeMap = common_internal.DataTypeMap -NullTreatment = common_internal.NullTreatment PythonType = common_internal.PythonType RexType = common_internal.RexType SqlFunction = common_internal.SqlFunction @@ -47,3 +47,16 @@ "SqlStatistics", "SqlFunction", ] + + +class NullTreatment(Enum): + """Describe how null values are to be treated by functions. + + This is used primarily by aggregate and window functions. It can be set on + these functions using the builder approach described in + ref:`_window_functions` and ref:`_aggregation` in the online documentation. + + """ + + RESPECT_NULLS = common_internal.NullTreatment.RESPECT_NULLS + IGNORE_NULLS = common_internal.NullTreatment.IGNORE_NULLS diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 46b8fa1bd..56dff22a4 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -180,7 +180,9 @@ def with_column_renamed(self, old_name: str, new_name: str) -> DataFrame: """ return DataFrame(self.df.with_column_renamed(old_name, new_name)) - def aggregate(self, group_by: list[Expr], aggs: list[Expr]) -> DataFrame: + def aggregate( + self, group_by: list[Expr] | Expr, aggs: list[Expr] | Expr + ) -> DataFrame: """Aggregates the rows of the current DataFrame. 
Args: @@ -190,6 +192,9 @@ def aggregate(self, group_by: list[Expr], aggs: list[Expr]) -> DataFrame: Returns: DataFrame after aggregation. """ + group_by = group_by if isinstance(group_by, list) else [group_by] + aggs = aggs if isinstance(aggs, list) else [aggs] + group_by = [e.expr for e in group_by] aggs = [e.expr for e in aggs] return DataFrame(self.df.aggregate(group_by, aggs)) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 7fa608037..bd6a86fb8 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -473,7 +473,7 @@ def null_treatment(self, null_treatment: NullTreatment) -> ExprFuncBuilder: set parameters for either window or aggregate functions. If used on any other type of expression, an error will be generated when ``build()`` is called. """ - return ExprFuncBuilder(self.expr.null_treatment(null_treatment)) + return ExprFuncBuilder(self.expr.null_treatment(null_treatment.value)) def partition_by(self, *partition_by: Expr) -> ExprFuncBuilder: """Set the partitioning for a window function. 
@@ -518,7 +518,7 @@ def distinct(self) -> ExprFuncBuilder: def null_treatment(self, null_treatment: NullTreatment) -> ExprFuncBuilder: """Set how nulls are treated for either window or aggregate functions.""" - return ExprFuncBuilder(self.builder.null_treatment(null_treatment)) + return ExprFuncBuilder(self.builder.null_treatment(null_treatment.value)) def partition_by(self, *partition_by: Expr) -> ExprFuncBuilder: """Set partitioning for window functions.""" diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 97b4fe1d5..163ff04e4 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -18,9 +18,10 @@ from __future__ import annotations -from datafusion._internal import functions as f, common +from datafusion._internal import functions as f, expr as expr_internal from datafusion.expr import CaseBuilder, Expr, WindowFrame from datafusion.context import SessionContext +from datafusion.common import NullTreatment from typing import Any, Optional @@ -126,7 +127,6 @@ "floor", "from_unixtime", "gcd", - "grouping", "in_list", "initcap", "isnan", @@ -180,6 +180,7 @@ "named_struct", "nanvl", "now", + "nth_value", "nullif", "octet_length", "order_by", @@ -222,6 +223,7 @@ "stddev", "stddev_pop", "stddev_samp", + "string_agg", "strpos", "struct", "substr", @@ -244,6 +246,7 @@ "var", "var_pop", "var_samp", + "var_sample", "when", # Window Functions "window", @@ -258,6 +261,12 @@ ] +def expr_list_to_raw_expr_list( + expr_list: Optional[list[Expr]], +) -> Optional[list[expr_internal.Expr]]: + return [e.expr for e in expr_list] if expr_list is not None else None + + def isnan(expr: Expr) -> Expr: """Returns true if a given number is +NaN or -NaN otherwise returns false.""" return Expr(f.isnan(expr.expr)) @@ -358,9 +367,18 @@ def col(name: str) -> Expr: return Expr(f.col(name)) -def count_star() -> Expr: - """Create a COUNT(1) aggregate expression.""" - return Expr(f.count_star()) +def count_star(filter: Optional[Expr] = 
None) -> Expr: + """Create a COUNT(1) aggregate expression. + + This aggregate function will count all of the rows in the partition. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``distinct``, and ``null_treatment``. + + Args: + filter: If provided, only count rows for which the filter is True + """ + return count(Expr.literal(1), filter=filter) def case(expr: Expr) -> CaseBuilder: @@ -400,8 +418,8 @@ def window( df.select(functions.lag(col("a")).partition_by(col("b")).build()) """ args = [a.expr for a in args] - partition_by = [e.expr for e in partition_by] if partition_by is not None else None - order_by = [o.expr for o in order_by] if order_by is not None else None + partition_by = expr_list_to_raw_expr_list(partition_by) + order_by = expr_list_to_raw_expr_list(order_by) window_frame = window_frame.window_frame if window_frame is not None else None return Expr(f.window(name, args, partition_by, order_by, window_frame, ctx)) @@ -1486,291 +1504,788 @@ def flatten(array: Expr) -> Expr: # aggregate functions -def approx_distinct(expression: Expr) -> Expr: - """Returns the approximate number of distinct values.""" - return Expr(f.approx_distinct(expression.expr)) +def approx_distinct( + expression: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Returns the approximate number of distinct values. + + This aggregate function is similar to :py:func:`count` with distinct set, but it + will approximate the number of distinct entries. It may return significantly faster + than :py:func:`count` for some DataFrames. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. 
+ + Args: + expression: Values to check for distinct entries + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + + return Expr(f.approx_distinct(expression.expr, filter=filter_raw)) + +def approx_median(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Returns the approximate median value. -def approx_median(arg: Expr, distinct: bool = False) -> Expr: - """Returns the approximate median value.""" - return Expr(f.approx_median(arg.expr, distinct=distinct)) + This aggregate function is similar to :py:func:`median`, but it will only + approximate the median. It may return significantly faster for some DataFrames. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by`` and ``null_treatment``, and ``distinct``. + + Args: + expression: Values to find the median for + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.approx_median(expression.expr, filter=filter_raw)) def approx_percentile_cont( expression: Expr, - percentile: Expr, - num_centroids: Expr | None = None, - distinct: bool = False, + percentile: float, + num_centroids: Optional[int] = None, + filter: Optional[Expr] = None, ) -> Expr: - """Returns the value that is approximately at a given percentile of ``expr``.""" - if num_centroids is None: - return Expr( - f.approx_percentile_cont( - expression.expr, percentile.expr, distinct=distinct, num_centroids=None - ) - ) + """Returns the value that is approximately at a given percentile of ``expr``. + + This aggregate function assumes the input values form a continuous distribution. + Suppose you have a DataFrame which consists of 100 different test scores. 
If you + called this function with a percentile of 0.9, it would return the value of the + test score that is above 90% of the other test scores. The returned value may be + between two of the values. + + This function uses the [t-digest](https://arxiv.org/abs/1902.04023) algorithm to + compute the percentile. You can limit the number of bins used in this algorithm by + setting the ``num_centroids`` parameter. + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + expression: Values for which to find the approximate percentile + percentile: This must be between 0.0 and 1.0, inclusive + num_centroids: Max bin size for the t-digest algorithm + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None return Expr( f.approx_percentile_cont( - expression.expr, - percentile.expr, - distinct=distinct, - num_centroids=num_centroids.expr, + expression.expr, percentile, num_centroids=num_centroids, filter=filter_raw ) ) def approx_percentile_cont_with_weight( - arg: Expr, weight: Expr, percentile: Expr, distinct: bool = False + expression: Expr, weight: Expr, percentile: float, filter: Optional[Expr] = None ) -> Expr: - """Returns the value of the approximate percentile. + """Returns the value of the weighted approximate percentile. + + This aggregate function is similar to :py:func:`approx_percentile_cont` except that + it uses the associated weights. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``.
+ + Args: + expression: Values for which to find the approximate percentile + weight: Relative weight for each of the values in ``expression`` + percentile: This must be between 0.0 and 1.0, inclusive + filter: If provided, only compute against rows for which the filter is True - This function is similar to :py:func:`approx_percentile_cont` except that it uses - the associated associated weights. """ + filter_raw = filter.expr if filter is not None else None return Expr( f.approx_percentile_cont_with_weight( - arg.expr, weight.expr, percentile.expr, distinct=distinct + expression.expr, weight.expr, percentile, filter=filter_raw ) ) -def array_agg(arg: Expr, distinct: bool = False) -> Expr: - """Aggregate values into an array.""" - return Expr(f.array_agg(arg.expr, distinct=distinct)) +def array_agg( + expression: Expr, + distinct: bool = False, + filter: Optional[Expr] = None, + order_by: Optional[list[Expr]] = None, +) -> Expr: + """Aggregate values into an array. + Currently ``distinct`` and ``order_by`` cannot be used together. As a work around, + consider :py:func:`array_sort` after aggregation. + [Issue Tracker](https://github.com/apache/datafusion/issues/12371) -def avg(arg: Expr, distinct: bool = False) -> Expr: - """Returns the average value.""" - return Expr(f.avg(arg.expr, distinct=distinct)) + If using the builder functions described in ref:`_aggregation` this function ignores + the option ``null_treatment``. 
+ Args: + expression: Values to combine into an array + distinct: If True, a single entry for each distinct value will be in the result + filter: If provided, only compute against rows for which the filter is True + order_by: Order the resultant array values + """ + order_by_raw = expr_list_to_raw_expr_list(order_by) + filter_raw = filter.expr if filter is not None else None + + return Expr( + f.array_agg( + expression.expr, distinct=distinct, filter=filter_raw, order_by=order_by_raw + ) + ) -def corr(value1: Expr, value2: Expr, distinct: bool = False) -> Expr: - """Returns the correlation coefficient between ``value1`` and ``value2``.""" - return Expr(f.corr(value1.expr, value2.expr, distinct=distinct)) +def avg( + expression: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Returns the average value. -def count(args: Expr | list[Expr] | None = None, distinct: bool = False) -> Expr: - """Returns the number of rows that match the given arguments.""" - if args is None: - return count(Expr.literal(1), distinct=distinct) - if isinstance(args, list): - args = [arg.expr for arg in args] - elif isinstance(args, Expr): - args = [args.expr] - return Expr(f.count(*args, distinct=distinct)) + This aggregate function expects a numeric expression and will return a float. + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. -def covar(y: Expr, x: Expr) -> Expr: - """Computes the sample covariance. + Args: + expression: Values to compute the average of + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.avg(expression.expr, filter=filter_raw)) - This is an alias for :py:func:`covar_samp`. + +def corr(value_y: Expr, value_x: Expr, filter: Optional[Expr] = None) -> Expr: + """Returns the correlation coefficient between ``value_y`` and ``value_x``.
+ + This aggregate function expects both values to be numeric and will return a float. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + value_y: The dependent variable for correlation + value_x: The independent variable for correlation + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.corr(value_y.expr, value_x.expr, filter=filter_raw)) + + +def count( + expressions: Expr | list[Expr] | None = None, + distinct: bool = False, + filter: Optional[Expr] = None, +) -> Expr: + """Returns the number of rows that match the given arguments. + + This aggregate function will count the non-null rows provided in the expression. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by`` and ``null_treatment``. + + Args: + expressions: Argument to perform bitwise calculation on + distinct: If True, a single entry for each distinct value will be in the result + filter: If provided, only compute against rows for which the filter is True """ - return covar_samp(y, x) + filter_raw = filter.expr if filter is not None else None + if expressions is None: + args = [Expr.literal(1).expr] + elif isinstance(expressions, list): + args = [arg.expr for arg in expressions] + else: + args = [expressions.expr] -def covar_pop(y: Expr, x: Expr) -> Expr: - """Computes the population covariance.""" - return Expr(f.covar_pop(y.expr, x.expr)) + return Expr(f.count(*args, distinct=distinct, filter=filter_raw)) -def covar_samp(y: Expr, x: Expr) -> Expr: - """Computes the sample covariance.""" - return Expr(f.covar_samp(y.expr, x.expr)) +def covar_pop(value_y: Expr, value_x: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the population covariance. 
+ This aggregate function expects both values to be numeric and will return a float. -def grouping(arg: Expr, distinct: bool = False) -> Expr: - """Indicates if the expression is aggregated or not. + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. - Returns 1 if the value of the argument is aggregated, 0 if not. + Args: + value_y: The dependent variable for covariance + value_x: The independent variable for covariance + filter: If provided, only compute against rows for which the filter is True """ - return Expr(f.grouping(arg.expr, distinct=distinct)) + filter_raw = filter.expr if filter is not None else None + return Expr(f.covar_pop(value_y.expr, value_x.expr, filter=filter_raw)) -def max(arg: Expr, distinct: bool = False) -> Expr: - """Returns the maximum value of the argument.""" - return Expr(f.max(arg.expr, distinct=distinct)) +def covar_samp(value_y: Expr, value_x: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the sample covariance. + This aggregate function expects both values to be numeric and will return a float. -def mean(arg: Expr, distinct: bool = False) -> Expr: + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + value_y: The dependent variable for covariance + value_x: The independent variable for covariance + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.covar_samp(value_y.expr, value_x.expr, filter=filter_raw)) + + +def covar(value_y: Expr, value_x: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the sample covariance. + + This is an alias for :py:func:`covar_samp`. 
+ """ + return covar_samp(value_y, value_x, filter) + + +def max(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Aggregate function that returns the maximum value of the argument. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + expression: The value to find the maximum of + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.max(expression.expr, filter=filter_raw)) + + +def mean(expression: Expr, filter: Optional[Expr] = None) -> Expr: """Returns the average (mean) value of the argument. This is an alias for :py:func:`avg`. """ - return avg(arg, distinct) + return avg(expression, filter) + + +def median( + expression: Expr, distinct: bool = False, filter: Optional[Expr] = None +) -> Expr: + """Computes the median of a set of numbers. + + This aggregate function returns the median value of the expression for the given + aggregate function. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by`` and ``null_treatment``. + + Args: + expression: The value to compute the median of + distinct: If True, a single entry for each distinct value will be in the result + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.median(expression.expr, distinct=distinct, filter=filter_raw)) + +def min(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Returns the minimum value of the argument. -def median(arg: Expr) -> Expr: - """Computes the median of a set of numbers.""" - return Expr(f.median(arg.expr)) + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. 
+ Args: + expression: The value to find the minimum of + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.min(expression.expr, filter=filter_raw)) -def min(arg: Expr, distinct: bool = False) -> Expr: - """Returns the minimum value of the argument.""" - return Expr(f.min(arg.expr, distinct=distinct)) +def sum( + expression: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Computes the sum of a set of numbers. -def sum(arg: Expr) -> Expr: - """Computes the sum of a set of numbers.""" - return Expr(f.sum(arg.expr)) + This aggregate function expects a numeric expression. + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. -def stddev(arg: Expr, distinct: bool = False) -> Expr: - """Computes the standard deviation of the argument.""" - return Expr(f.stddev(arg.expr, distinct=distinct)) + Args: + expression: Values to combine into an array + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.sum(expression.expr, filter=filter_raw)) -def stddev_pop(arg: Expr, distinct: bool = False) -> Expr: - """Computes the population standard deviation of the argument.""" - return Expr(f.stddev_pop(arg.expr, distinct=distinct)) +def stddev(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the standard deviation of the argument. + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. 
-def stddev_samp(arg: Expr, distinct: bool = False) -> Expr: + Args: + expression: The value to find the minimum of + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.stddev(expression.expr, filter=filter_raw)) + + +def stddev_pop(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the population standard deviation of the argument. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + expression: The value to find the minimum of + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.stddev_pop(expression.expr, filter=filter_raw)) + + +def stddev_samp(arg: Expr, filter: Optional[Expr] = None) -> Expr: """Computes the sample standard deviation of the argument. This is an alias for :py:func:`stddev`. """ - return stddev(arg, distinct) + return stddev(arg, filter=filter) -def var(arg: Expr) -> Expr: +def var(expression: Expr, filter: Optional[Expr] = None) -> Expr: """Computes the sample variance of the argument. This is an alias for :py:func:`var_samp`. """ - return var_samp(arg) + return var_samp(expression, filter) + + +def var_pop(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the population variance of the argument. + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. 
-def var_pop(arg: Expr, distinct: bool = False) -> Expr: - """Computes the population variance of the argument.""" - return Expr(f.var_pop(arg.expr, distinct=distinct)) + Args: + expression: The variable to compute the variance for + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.var_pop(expression.expr, filter=filter_raw)) -def var_samp(arg: Expr) -> Expr: - """Computes the sample variance of the argument.""" - return Expr(f.var_samp(arg.expr)) +def var_samp(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the sample variance of the argument. + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. -def regr_avgx(y: Expr, x: Expr, distinct: bool = False) -> Expr: + Args: + expression: The variable to compute the variance for + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.var_sample(expression.expr, filter=filter_raw)) + + +def var_sample(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the sample variance of the argument. + + This is an alias for :py:func:`var_samp`. + """ + return var_samp(expression, filter) + + +def regr_avgx( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: """Computes the average of the independent variable ``x``. - Only non-null pairs of the inputs are evaluated. + This is a linear regression aggregate function. Only non-null pairs of the inputs + are evaluated. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. 
+ + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True """ - return Expr(f.regr_avgx(y.expr, x.expr, distinct)) + filter_raw = filter.expr if filter is not None else None + + return Expr(f.regr_avgx(y.expr, x.expr, filter=filter_raw)) -def regr_avgy(y: Expr, x: Expr, distinct: bool = False) -> Expr: +def regr_avgy( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: """Computes the average of the dependent variable ``y``. - Only non-null pairs of the inputs are evaluated. + This is a linear regression aggregate function. Only non-null pairs of the inputs + are evaluated. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + + return Expr(f.regr_avgy(y.expr, x.expr, filter=filter_raw)) + + +def regr_count( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Counts the number of rows in which both expressions are not null. + + This is a linear regression aggregate function. Only non-null pairs of the inputs + are evaluated. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. 
+ + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + + return Expr(f.regr_count(y.expr, x.expr, filter=filter_raw)) + + +def regr_intercept( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Computes the intercept from the linear regression. + + This is a linear regression aggregate function. Only non-null pairs of the inputs + are evaluated. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + + return Expr(f.regr_intercept(y.expr, x.expr, filter=filter_raw)) + + +def regr_r2( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Computes the R-squared value from linear regression. + + This is a linear regression aggregate function. Only non-null pairs of the inputs + are evaluated. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + + return Expr(f.regr_r2(y.expr, x.expr, filter=filter_raw)) + + +def regr_slope( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Computes the slope from linear regression. + + This is a linear regression aggregate function. Only non-null pairs of the inputs + are evaluated. 
+ + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True """ - return Expr(f.regr_avgy(y.expr, x.expr, distinct)) + filter_raw = filter.expr if filter is not None else None + + return Expr(f.regr_slope(y.expr, x.expr, filter=filter_raw)) -def regr_count(y: Expr, x: Expr, distinct: bool = False) -> Expr: - """Counts the number of rows in which both expressions are not null.""" - return Expr(f.regr_count(y.expr, x.expr, distinct)) +def regr_sxx( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Computes the sum of squares of the independent variable ``x``. + + This is a linear regression aggregate function. Only non-null pairs of the inputs + are evaluated. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.regr_sxx(y.expr, x.expr, filter=filter_raw)) -def regr_intercept(y: Expr, x: Expr, distinct: bool = False) -> Expr: - """Computes the intercept from the linear regression.""" - return Expr(f.regr_intercept(y.expr, x.expr, distinct)) +def regr_sxy( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Computes the sum of products of pairs of numbers. + + This is a linear regression aggregate function. Only non-null pairs of the inputs + are evaluated. 
-def regr_r2(y: Expr, x: Expr, distinct: bool = False) -> Expr: - """Computes the R-squared value from linear regression.""" - return Expr(f.regr_r2(y.expr, x.expr, distinct)) + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None -def regr_slope(y: Expr, x: Expr, distinct: bool = False) -> Expr: - """Computes the slope from linear regression.""" - return Expr(f.regr_slope(y.expr, x.expr, distinct)) + return Expr(f.regr_sxy(y.expr, x.expr, filter=filter_raw)) -def regr_sxx(y: Expr, x: Expr, distinct: bool = False) -> Expr: - """Computes the sum of squares of the independent variable ``x``.""" - return Expr(f.regr_sxx(y.expr, x.expr, distinct)) +def regr_syy( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Computes the sum of squares of the dependent variable ``y``. + This is a linear regression aggregate function. Only non-null pairs of the inputs + are evaluated. -def regr_sxy(y: Expr, x: Expr, distinct: bool = False) -> Expr: - """Computes the sum of products of pairs of numbers.""" - return Expr(f.regr_sxy(y.expr, x.expr, distinct)) + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. 
+ Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None -def regr_syy(y: Expr, x: Expr, distinct: bool = False) -> Expr: - """Computes the sum of squares of the dependent variable ``y``.""" - return Expr(f.regr_syy(y.expr, x.expr, distinct)) + return Expr(f.regr_syy(y.expr, x.expr, filter=filter_raw)) def first_value( - arg: Expr, - distinct: bool = False, - filter: Optional[bool] = None, + expression: Expr, + filter: Optional[Expr] = None, order_by: Optional[list[Expr]] = None, - null_treatment: Optional[common.NullTreatment] = None, + null_treatment: NullTreatment = NullTreatment.RESPECT_NULLS, ) -> Expr: - """Returns the first value in a group of values.""" - order_by_cols = [e.expr for e in order_by] if order_by is not None else None + """Returns the first value in a group of values. + + This aggregate function will return the first value in the partition. + + If using the builder functions described in ref:`_aggregation` this function ignores + the option ``distinct``. + + Args: + expression: Argument to return the first value of + filter: If provided, only compute against rows for which the filter is True + order_by: Set the ordering of the expression to evaluate + null_treatment: Assign whether to respect or ignore null values.
+ """ + order_by_raw = expr_list_to_raw_expr_list(order_by) + filter_raw = filter.expr if filter is not None else None return Expr( f.first_value( - arg.expr, - distinct=distinct, - filter=filter, - order_by=order_by_cols, - null_treatment=null_treatment, + expression.expr, + filter=filter_raw, + order_by=order_by_raw, + null_treatment=null_treatment.value, ) ) def last_value( - arg: Expr, - distinct: bool = False, - filter: Optional[bool] = None, + expression: Expr, + filter: Optional[Expr] = None, order_by: Optional[list[Expr]] = None, - null_treatment: Optional[common.NullTreatment] = None, + null_treatment: NullTreatment = NullTreatment.RESPECT_NULLS, ) -> Expr: """Returns the last value in a group of values. - To set parameters on this expression, use ``.order_by()``, ``.distinct()``, - ``.filter()``, or ``.null_treatment()``. + This aggregate function will return the last value in the partition. + + If using the builder functions described in ref:`_aggregation` this function ignores + the option ``distinct``. + + Args: + expression: Argument to return the last value of + filter: If provided, only compute against rows for which the filter is True + order_by: Set the ordering of the expression to evaluate + null_treatment: Assign whether to respect or ignore null values. """ - order_by_cols = [e.expr for e in order_by] if order_by is not None else None + order_by_raw = expr_list_to_raw_expr_list(order_by) + filter_raw = filter.expr if filter is not None else None return Expr( f.last_value( - arg.expr, - distinct=distinct, - filter=filter, - order_by=order_by_cols, - null_treatment=null_treatment, + expression.expr, + filter=filter_raw, + order_by=order_by_raw, + null_treatment=null_treatment.value, + ) + ) + + +def nth_value( + expression: Expr, + n: int, + filter: Optional[Expr] = None, + order_by: Optional[list[Expr]] = None, + null_treatment: NullTreatment = NullTreatment.RESPECT_NULLS, +) -> Expr: + """Returns the n-th value in a group of values.
+ + This aggregate function will return the n-th value in the partition. + + If using the builder functions described in ref:`_aggregation` this function ignores + the option ``distinct``. + + Args: + expression: Expression to take the n-th value of + n: Index of value to return. Starts at 1. + filter: If provided, only compute against rows for which the filter is True + order_by: Set the ordering of the expression to evaluate + null_treatment: Assign whether to respect or ignore null values. + """ + order_by_raw = expr_list_to_raw_expr_list(order_by) + filter_raw = filter.expr if filter is not None else None + + return Expr( + f.nth_value( + expression.expr, + n, + filter=filter_raw, + order_by=order_by_raw, + null_treatment=null_treatment.value, ) ) -def bit_and(arg: Expr, distinct: bool = False) -> Expr: - """Computes the bitwise AND of the argument.""" - return Expr(f.bit_and(arg.expr, distinct=distinct)) +def bit_and(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the bitwise AND of the argument. + + This aggregate function will bitwise compare every value in the input partition. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + Args: + expression: Argument to perform bitwise calculation on + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.bit_and(expression.expr, filter=filter_raw)) -def bit_or(arg: Expr, distinct: bool = False) -> Expr: - """Computes the bitwise OR of the argument.""" - return Expr(f.bit_or(arg.expr, distinct=distinct)) +def bit_or(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the bitwise OR of the argument. 
-def bit_xor(arg: Expr, distinct: bool = False) -> Expr: - """Computes the bitwise XOR of the argument.""" - return Expr(f.bit_xor(arg.expr, distinct=distinct)) + This aggregate function will bitwise compare every value in the input partition. + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + expression: Argument to perform bitwise calculation on + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.bit_or(expression.expr, filter=filter_raw)) -def bool_and(arg: Expr, distinct: bool = False) -> Expr: - """Computes the boolean AND of the argument.""" - return Expr(f.bool_and(arg.expr, distinct=distinct)) +def bit_xor( + expression: Expr, distinct: bool = False, filter: Optional[Expr] = None +) -> Expr: + """Computes the bitwise XOR of the argument. -def bool_or(arg: Expr, distinct: bool = False) -> Expr: - """Computes the boolean OR of the argument.""" - return Expr(f.bool_or(arg.expr, distinct=distinct)) + This aggregate function will bitwise compare every value in the input partition. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by`` and ``null_treatment``. + + Args: + expression: Argument to perform bitwise calculation on + distinct: If True, evaluate each unique value of expression only once + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.bit_xor(expression.expr, distinct=distinct, filter=filter_raw)) + + +def bool_and(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the boolean AND of the argument. + + This aggregate function will compare every value in the input partition. These are + expected to be boolean values. 
+ + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + expression: Argument to perform calculation on + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.bool_and(expression.expr, filter=filter_raw)) + + +def bool_or(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the boolean OR of the argument. + + This aggregate function will compare every value in the input partition. These are + expected to be boolean values. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + expression: Argument to perform calculation on + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.bool_or(expression.expr, filter=filter_raw)) def lead( @@ -2107,3 +2622,37 @@ def ntile( order_by=order_cols, ) ) + + +def string_agg( + expression: Expr, + delimiter: str, + filter: Optional[Expr] = None, + order_by: Optional[list[Expr]] = None, +) -> Expr: + """Concatenates the input strings. + + This aggregate function will concatenate input strings, ignoring null values, and + separating them with the specified delimiter. Non-string values will be converted to + their string equivalents. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``distinct`` and ``null_treatment``. 
+ + Args: + expression: Argument to perform bitwise calculation on + delimiter: Text to place between each value of expression + filter: If provided, only compute against rows for which the filter is True + order_by: Set the ordering of the expression to evaluate + """ + order_by_raw = expr_list_to_raw_expr_list(order_by) + filter_raw = filter.expr if filter is not None else None + + return Expr( + f.string_agg( + expression.expr, + delimiter, + filter=filter_raw, + order_by=order_by_raw, + ) + ) diff --git a/python/datafusion/tests/test_aggregation.py b/python/datafusion/tests/test_aggregation.py index ab653c403..243a8c3c9 100644 --- a/python/datafusion/tests/test_aggregation.py +++ b/python/datafusion/tests/test_aggregation.py @@ -21,6 +21,7 @@ from datafusion import SessionContext, column, lit from datafusion import functions as f +from datafusion.common import NullTreatment @pytest.fixture @@ -34,12 +35,30 @@ def df(): pa.array([4, 4, 6]), pa.array([9, 8, 5]), pa.array([True, True, False]), + pa.array([1, 2, None]), ], - names=["a", "b", "c", "d"], + names=["a", "b", "c", "d", "e"], ) return ctx.create_dataframe([[batch]]) +@pytest.fixture +def df_partitioned(): + ctx = SessionContext() + + # create a RecordBatch and a new DataFrame from it + batch = pa.RecordBatch.from_arrays( + [ + pa.array([0, 1, 2, 3, 4, 5, 6]), + pa.array([7, None, 7, 8, 9, None, 9]), + pa.array(["A", "A", "A", "A", "B", "B", "B"]), + ], + names=["a", "b", "c"], + ) + + return ctx.create_dataframe([[batch]]) + + @pytest.fixture def df_aggregate_100(): ctx = SessionContext() @@ -87,6 +106,7 @@ def df_aggregate_100(): ], ) def test_aggregation_stats(df, agg_expr, calc_expected): + df = df.select("a", "b", "c", "d") agg_df = df.aggregate([], [agg_expr]) result = agg_df.collect()[0] values_a, values_b, values_c, values_d = df.collect()[0] @@ -95,68 +115,323 @@ def test_aggregation_stats(df, agg_expr, calc_expected): @pytest.mark.parametrize( - "agg_expr, expected", + "agg_expr, expected, 
array_sort", [ - (f.approx_distinct(column("b")), pa.array([2], type=pa.uint64())), - (f.approx_median(column("b")), pa.array([4])), - (f.approx_percentile_cont(column("b"), lit(0.5)), pa.array([4])), + (f.approx_distinct(column("b")), pa.array([2], type=pa.uint64()), False), + ( + f.approx_distinct( + column("b"), + filter=column("a") != lit(3), + ), + pa.array([1], type=pa.uint64()), + False, + ), + (f.approx_median(column("b")), pa.array([4]), False), + (f.median(column("b"), distinct=True), pa.array([5]), False), + (f.median(column("b"), filter=column("a") != 2), pa.array([5]), False), + (f.approx_median(column("b"), filter=column("a") != 2), pa.array([5]), False), + (f.approx_percentile_cont(column("b"), 0.5), pa.array([4]), False), ( - f.approx_percentile_cont_with_weight(column("b"), lit(0.6), lit(0.5)), + f.approx_percentile_cont_with_weight(column("b"), lit(0.6), 0.5), pa.array([6], type=pa.float64()), + False, + ), + ( + f.approx_percentile_cont_with_weight( + column("b"), lit(0.6), 0.5, filter=column("a") != lit(3) + ), + pa.array([4], type=pa.float64()), + False, + ), + (f.array_agg(column("b")), pa.array([[4, 4, 6]]), False), + (f.array_agg(column("b"), distinct=True), pa.array([[4, 6]]), True), + ( + f.array_agg(column("e"), filter=column("e").is_not_null()), + pa.array([[1, 2]]), + False, + ), + ( + f.array_agg(column("b"), order_by=[column("c")]), + pa.array([[6, 4, 4]]), + False, + ), + (f.avg(column("b"), filter=column("a") != lit(1)), pa.array([5.0]), False), + (f.sum(column("b"), filter=column("a") != lit(1)), pa.array([10]), False), + (f.count(column("b"), distinct=True), pa.array([2]), False), + (f.count(column("b"), filter=column("a") != 3), pa.array([2]), False), + (f.count(), pa.array([3]), False), + (f.count(column("e")), pa.array([2]), False), + (f.count_star(filter=column("a") != 3), pa.array([2]), False), + (f.max(column("a"), filter=column("a") != lit(3)), pa.array([2]), False), + (f.min(column("a"), filter=column("a") != lit(1)), 
pa.array([2]), False), + ( + f.stddev(column("a"), filter=column("a") != lit(2)), + pa.array([np.sqrt(2)]), + False, + ), + ( + f.stddev_pop(column("a"), filter=column("a") != lit(2)), + pa.array([1.0]), + False, ), - (f.array_agg(column("b")), pa.array([[4, 4, 6]])), ], ) -def test_aggregation(df, agg_expr, expected): - agg_df = df.aggregate([], [agg_expr]) +def test_aggregation(df, agg_expr, expected, array_sort): + agg_df = df.aggregate([], [agg_expr.alias("agg_expr")]) + if array_sort: + agg_df = agg_df.select(f.array_sort(column("agg_expr"))) + agg_df.show() result = agg_df.collect()[0] + + print(result) assert result.column(0) == expected -def test_aggregate_100(df_aggregate_100): +@pytest.mark.parametrize( + "name,expr,expected", + [ + ( + "approx_percentile_cont", + f.approx_percentile_cont(column("c3"), 0.95, num_centroids=200), + [73, 68, 122, 124, 115], + ), + ( + "approx_perc_cont_few_centroids", + f.approx_percentile_cont(column("c3"), 0.95, num_centroids=5), + [72, 68, 119, 124, 115], + ), + ( + "approx_perc_cont_filtered", + f.approx_percentile_cont( + column("c3"), 0.95, num_centroids=200, filter=column("c3") > lit(0) + ), + [83, 68, 122, 124, 117], + ), + ( + "corr", + f.corr(column("c3"), column("c2")), + [-0.1056, -0.2808, 0.0023, 0.0022, -0.2473], + ), + ( + "corr_w_filter", + f.corr(column("c3"), column("c2"), filter=column("c3") > lit(0)), + [-0.3298, 0.2925, 0.2467, -0.2269, 0.0358], + ), + ( + "covar_pop", + f.covar_pop(column("c3"), column("c2")), + [-7.2857, -25.6731, 0.2222, 0.2469, -20.2857], + ), + ( + "covar_pop_w_filter", + f.covar_pop(column("c3"), column("c2"), filter=column("c3") > lit(0)), + [-9.25, 9.0579, 13.7521, -9.9669, 1.1641], + ), + ( + "covar_samp", + f.covar_samp(column("c3"), column("c2")), + [-7.65, -27.0994, 0.2333, 0.2614, -21.3], + ), + ( + "covar_samp_w_filter", + f.covar_samp(column("c3"), column("c2"), filter=column("c3") > lit(0)), + [-10.5714, 9.9636, 15.1273, -10.9636, 1.2417], + ), + ( + "var_samp", + 
f.var_samp(column("c2")), + [1.9286, 2.2047, 1.6333, 2.1438, 1.6], + ), + ( + "var_samp_w_filter", + f.var_samp(column("c2"), filter=column("c3") > lit(0)), + [1.4286, 2.4182, 1.8545, 1.4727, 1.6292], + ), + ( + "var_pop", + f.var_pop(column("c2")), + [1.8367, 2.0886, 1.5556, 2.0247, 1.5238], + ), + ( + "var_pop_w_filter", + f.var_pop(column("c2"), filter=column("c3") > lit(0)), + [1.25, 2.1983, 1.686, 1.3388, 1.5273], + ), + ], +) +def test_aggregate_100(df_aggregate_100, name, expr, expected): # https://github.com/apache/datafusion/blob/bddb6415a50746d2803dd908d19c3758952d74f9/datafusion/sqllogictest/test_files/aggregate.slt#L1490-L1498 - result = ( + df = ( df_aggregate_100.aggregate( [column("c1")], - [f.approx_percentile_cont(column("c3"), lit(0.95), lit(200)).alias("c3")], + [expr.alias(name)], ) + .select("c1", f.round(column(name), lit(4)).alias(name)) .sort(column("c1").sort(ascending=True)) - .collect() ) + df.show() - assert len(result) == 1 - result = result[0] - assert result.column("c1") == pa.array(["a", "b", "c", "d", "e"]) - assert result.column("c3") == pa.array([73, 68, 122, 124, 115]) + expected_dict = { + "c1": ["a", "b", "c", "d", "e"], + name: expected, + } + assert df.collect()[0].to_pydict() == expected_dict -def test_bit_add_or_xor(df): - df = df.aggregate( - [], - [ - f.bit_and(column("a")), - f.bit_or(column("b")), - f.bit_xor(column("c")), - ], - ) - result = df.collect() - result = result[0] - assert result.column(0) == pa.array([0]) - assert result.column(1) == pa.array([6]) - assert result.column(2) == pa.array([4]) +data_test_bitwise_and_boolean_functions = [ + ("bit_and", f.bit_and(column("a")), [0]), + ("bit_and_filter", f.bit_and(column("a"), filter=column("a") != lit(2)), [1]), + ("bit_or", f.bit_or(column("b")), [6]), + ("bit_or_filter", f.bit_or(column("b"), filter=column("a") != lit(3)), [4]), + ("bit_xor", f.bit_xor(column("c")), [4]), + ("bit_xor_distinct", f.bit_xor(column("b"), distinct=True), [2]), + ("bit_xor_filter", 
f.bit_xor(column("b"), filter=column("a") != lit(3)), [0]), + ( + "bit_xor_filter_distinct", + f.bit_xor(column("b"), distinct=True, filter=column("a") != lit(3)), + [4], + ), + ("bool_and", f.bool_and(column("d")), [False]), + ("bool_and_filter", f.bool_and(column("d"), filter=column("a") != lit(3)), [True]), + ("bool_or", f.bool_or(column("d")), [True]), + ("bool_or_filter", f.bool_or(column("d"), filter=column("a") == lit(3)), [False]), +] -def test_bool_and_or(df): - df = df.aggregate( - [], - [ - f.bool_and(column("d")), - f.bool_or(column("d")), - ], +@pytest.mark.parametrize("name,expr,result", data_test_bitwise_and_boolean_functions) +def test_bit_and_bool_fns(df, name, expr, result): + df = df.aggregate([], [expr.alias(name)]) + + expected = { + name: result, + } + + assert df.collect()[0].to_pydict() == expected + + +@pytest.mark.parametrize( + "name,expr,result", + [ + ("first_value", f.first_value(column("a")), [0, 4]), + ( + "first_value_ordered", + f.first_value(column("a"), order_by=[column("a").sort(ascending=False)]), + [3, 6], + ), + ( + "first_value_with_null", + f.first_value( + column("b"), + order_by=[column("b").sort(ascending=True)], + null_treatment=NullTreatment.RESPECT_NULLS, + ), + [None, None], + ), + ( + "first_value_ignore_null", + f.first_value( + column("b"), + order_by=[column("b").sort(ascending=True)], + null_treatment=NullTreatment.IGNORE_NULLS, + ), + [7, 9], + ), + ("last_value", f.last_value(column("a")), [3, 6]), + ( + "last_value_ordered", + f.last_value(column("a"), order_by=[column("a").sort(ascending=False)]), + [0, 4], + ), + ( + "last_value_with_null", + f.last_value( + column("b"), + order_by=[column("b").sort(ascending=True, nulls_first=False)], + null_treatment=NullTreatment.RESPECT_NULLS, + ), + [None, None], + ), + ( + "last_value_ignore_null", + f.last_value( + column("b"), + order_by=[column("b").sort(ascending=True)], + null_treatment=NullTreatment.IGNORE_NULLS, + ), + [8, 9], + ), + ("first_value", 
f.first_value(column("a")), [0, 4]), + ( + "nth_value_ordered", + f.nth_value(column("a"), 2, order_by=[column("a").sort(ascending=False)]), + [2, 5], + ), + ( + "nth_value_with_null", + f.nth_value( + column("b"), + 3, + order_by=[column("b").sort(ascending=True, nulls_first=False)], + null_treatment=NullTreatment.RESPECT_NULLS, + ), + [8, None], + ), + ( + "nth_value_ignore_null", + f.nth_value( + column("b"), + 2, + order_by=[column("b").sort(ascending=True)], + null_treatment=NullTreatment.IGNORE_NULLS, + ), + [7, 9], + ), + ], +) +def test_first_last_value(df_partitioned, name, expr, result) -> None: + df = df_partitioned.aggregate([column("c")], [expr.alias(name)]).sort(column("c")) + + expected = { + "c": ["A", "B"], + name: result, + } + + assert df.collect()[0].to_pydict() == expected + + +@pytest.mark.parametrize( + "name,expr,result", + [ + ("string_agg", f.string_agg(column("a"), ","), "one,two,three,two"), + ("string_agg", f.string_agg(column("b"), ""), "03124"), + ( + "string_agg", + f.string_agg(column("a"), ",", filter=column("b") != lit(3)), + "one,three,two", + ), + ( + "string_agg", + f.string_agg(column("a"), ",", order_by=[column("b")]), + "one,three,two,two", + ), + ], +) +def test_string_agg(name, expr, result) -> None: + ctx = SessionContext() + + df = ctx.from_pydict( + { + "a": ["one", "two", None, "three", "two"], + "b": [0, 3, 1, 2, 4], + } ) - result = df.collect() - result = result[0] - assert result.column(0) == pa.array([False]) - assert result.column(1) == pa.array([True]) + + df = df.aggregate([], [expr.alias(name)]) + + expected = { + name: [result], + } + df.show() + assert df.collect()[0].to_pydict() == expected diff --git a/python/datafusion/tests/test_functions.py b/python/datafusion/tests/test_functions.py index e7e6d79e1..8e3c51397 100644 --- a/python/datafusion/tests/test_functions.py +++ b/python/datafusion/tests/test_functions.py @@ -912,17 +912,64 @@ def test_regr_funcs_sql_2(): @pytest.mark.parametrize( "func, 
expected", [ - pytest.param(f.regr_slope, pa.array([2], type=pa.float64()), id="regr_slope"), + pytest.param(f.regr_slope(column("c2"), column("c1")), [4.6], id="regr_slope"), pytest.param( - f.regr_intercept, pa.array([0], type=pa.float64()), id="regr_intercept" + f.regr_slope(column("c2"), column("c1"), filter=column("c1") > literal(2)), + [8], + id="regr_slope_filter", + ), + pytest.param( + f.regr_intercept(column("c2"), column("c1")), [-4], id="regr_intercept" + ), + pytest.param( + f.regr_intercept( + column("c2"), column("c1"), filter=column("c1") > literal(2) + ), + [-16], + id="regr_intercept_filter", + ), + pytest.param(f.regr_count(column("c2"), column("c1")), [4], id="regr_count"), + pytest.param( + f.regr_count(column("c2"), column("c1"), filter=column("c1") > literal(2)), + [2], + id="regr_count_filter", + ), + pytest.param(f.regr_r2(column("c2"), column("c1")), [0.92], id="regr_r2"), + pytest.param( + f.regr_r2(column("c2"), column("c1"), filter=column("c1") > literal(2)), + [1.0], + id="regr_r2_filter", + ), + pytest.param(f.regr_avgx(column("c2"), column("c1")), [2.5], id="regr_avgx"), + pytest.param( + f.regr_avgx(column("c2"), column("c1"), filter=column("c1") > literal(2)), + [3.5], + id="regr_avgx_filter", + ), + pytest.param(f.regr_avgy(column("c2"), column("c1")), [7.5], id="regr_avgy"), + pytest.param( + f.regr_avgy(column("c2"), column("c1"), filter=column("c1") > literal(2)), + [12.0], + id="regr_avgy_filter", + ), + pytest.param(f.regr_sxx(column("c2"), column("c1")), [5.0], id="regr_sxx"), + pytest.param( + f.regr_sxx(column("c2"), column("c1"), filter=column("c1") > literal(2)), + [0.5], + id="regr_sxx_filter", + ), + pytest.param(f.regr_syy(column("c2"), column("c1")), [115.0], id="regr_syy"), + pytest.param( + f.regr_syy(column("c2"), column("c1"), filter=column("c1") > literal(2)), + [32.0], + id="regr_syy_filter", + ), + pytest.param(f.regr_sxy(column("c2"), column("c1")), [23.0], id="regr_sxy"), + pytest.param( + 
f.regr_sxy(column("c2"), column("c1"), filter=column("c1") > literal(2)), + [4.0], + id="regr_sxy_filter", ), - pytest.param(f.regr_count, pa.array([3], type=pa.uint64()), id="regr_count"), - pytest.param(f.regr_r2, pa.array([1], type=pa.float64()), id="regr_r2"), - pytest.param(f.regr_avgx, pa.array([2], type=pa.float64()), id="regr_avgx"), - pytest.param(f.regr_avgy, pa.array([4], type=pa.float64()), id="regr_avgy"), - pytest.param(f.regr_sxx, pa.array([2], type=pa.float64()), id="regr_sxx"), - pytest.param(f.regr_syy, pa.array([8], type=pa.float64()), id="regr_syy"), - pytest.param(f.regr_sxy, pa.array([4], type=pa.float64()), id="regr_sxy"), ], ) def test_regr_funcs_df(func, expected): @@ -932,38 +979,18 @@ def test_regr_funcs_df(func, expected): ctx = SessionContext() # Create a DataFrame - data = {"column1": [1, 2, 3], "column2": [2, 4, 6]} + data = {"c1": [1, 2, 3, 4, 5, None], "c2": [2, 4, 8, 16, None, 64]} df = ctx.from_pydict(data, name="test_table") # Perform the regression function using DataFrame API - result_df = df.aggregate([], [func(f.col("column2"), f.col("column1"))]).collect() - - # Assertion for DataFrame API result - assert result_df[0].column(0) == expected - + df = df.aggregate([], [func.alias("result")]) + df.show() -def test_first_last_value(df): - df = df.aggregate( - [], - [ - f.first_value(column("a")), - f.first_value(column("b")), - f.first_value(column("d")), - f.last_value(column("a")), - f.last_value(column("b")), - f.last_value(column("d")), - ], - ) + expected_dict = { + "result": expected, + } - result = df.collect() - result = result[0] - assert result.column(0) == pa.array(["Hello"]) - assert result.column(1) == pa.array([4]) - assert result.column(2) == pa.array([datetime(2022, 12, 31)]) - assert result.column(3) == pa.array(["!"]) - assert result.column(4) == pa.array([6]) - assert result.column(5) == pa.array([datetime(2020, 7, 2)]) - df.show() + assert df.collect()[0].to_pydict() == expected_dict def 
test_binary_string_functions(df): diff --git a/python/datafusion/tests/test_wrapper_coverage.py b/python/datafusion/tests/test_wrapper_coverage.py index 44b9ca831..4a47de2e1 100644 --- a/python/datafusion/tests/test_wrapper_coverage.py +++ b/python/datafusion/tests/test_wrapper_coverage.py @@ -20,8 +20,19 @@ import datafusion.object_store import datafusion.substrait +# EnumType introduced in 3.11. 3.10 and prior it was called EnumMeta. +try: + from enum import EnumType +except ImportError: + from enum import EnumMeta as EnumType + def missing_exports(internal_obj, wrapped_obj) -> None: + # Special case enums - just make sure they exist since dir() + # and other functions get overridden. + if isinstance(wrapped_obj, EnumType): + return + for attr in dir(internal_obj): assert attr in dir(wrapped_obj) diff --git a/src/functions.rs b/src/functions.rs index b5b003dfe..b9ca6301a 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -35,305 +35,27 @@ use datafusion::functions_aggregate; use datafusion::logical_expr::expr::Alias; use datafusion::logical_expr::sqlparser::ast::NullTreatment as DFNullTreatment; use datafusion::logical_expr::{ - expr::{find_df_window_func, AggregateFunction, Sort, WindowFunction}, + expr::{find_df_window_func, Sort, WindowFunction}, lit, Expr, WindowFunctionDefinition, }; -#[pyfunction] -pub fn approx_distinct(expression: PyExpr) -> PyExpr { - functions_aggregate::expr_fn::approx_distinct(expression.expr).into() -} - -#[pyfunction] -pub fn approx_median(expression: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::approx_median(expression.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn approx_percentile_cont( - expression: PyExpr, - percentile: PyExpr, - distinct: bool, - num_centroids: Option, // enforces optional arguments at the end, currently -) -> PyResult { - let args = if let Some(num_centroids) = num_centroids { - 
vec![expression.expr, percentile.expr, num_centroids.expr] - } else { - vec![expression.expr, percentile.expr] - }; - let udaf = functions_aggregate::approx_percentile_cont::approx_percentile_cont_udaf(); - let expr = udaf.call(args); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn approx_percentile_cont_with_weight( - expression: PyExpr, - weight: PyExpr, - percentile: PyExpr, - distinct: bool, -) -> PyResult { - let expr = functions_aggregate::expr_fn::approx_percentile_cont_with_weight( - expression.expr, - weight.expr, - percentile.expr, - ); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn avg(expression: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::avg(expression.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn bit_and(expr_x: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::bit_and(expr_x.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn bit_or(expression: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::bit_or(expression.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn bit_xor(expression: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::bit_xor(expression.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn bool_and(expression: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::bool_and(expression.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn bool_or(expression: PyExpr, distinct: bool) -> PyResult { - let expr = 
functions_aggregate::expr_fn::bool_or(expression.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn corr(y: PyExpr, x: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::corr(y.expr, x.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn grouping(expression: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::grouping(expression.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn sum(args: PyExpr) -> PyExpr { - functions_aggregate::expr_fn::sum(args.expr).into() -} - -#[pyfunction] -pub fn covar_samp(y: PyExpr, x: PyExpr) -> PyExpr { - functions_aggregate::expr_fn::covar_samp(y.expr, x.expr).into() -} - -#[pyfunction] -pub fn covar_pop(y: PyExpr, x: PyExpr) -> PyExpr { - functions_aggregate::expr_fn::covar_pop(y.expr, x.expr).into() -} - -#[pyfunction] -pub fn median(arg: PyExpr) -> PyExpr { - functions_aggregate::expr_fn::median(arg.expr).into() -} - -#[pyfunction] -pub fn stddev(expression: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::stddev(expression.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn stddev_pop(expression: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::stddev_pop(expression.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn var_samp(expression: PyExpr) -> PyExpr { - functions_aggregate::expr_fn::var_sample(expression.expr).into() -} - -#[pyfunction] -pub fn var_pop(expression: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::var_pop(expression.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - 
-#[pyfunction] -pub fn regr_avgx(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::regr_avgx(expr_y.expr, expr_x.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn regr_avgy(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::regr_avgy(expr_y.expr, expr_x.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn regr_count(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::regr_count(expr_y.expr, expr_x.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn regr_intercept(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::regr_intercept(expr_y.expr, expr_x.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn regr_r2(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::regr_r2(expr_y.expr, expr_x.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn regr_slope(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::regr_slope(expr_y.expr, expr_x.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn regr_sxx(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::regr_sxx(expr_y.expr, expr_x.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn regr_sxy(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { - let expr = 
functions_aggregate::expr_fn::regr_sxy(expr_y.expr, expr_x.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - -#[pyfunction] -pub fn regr_syy(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::regr_syy(expr_y.expr, expr_x.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - fn add_builder_fns_to_aggregate( agg_fn: Expr, - distinct: bool, + distinct: Option, filter: Option, order_by: Option>, null_treatment: Option, ) -> PyResult { // Since ExprFuncBuilder::new() is private, we can guarantee initializing - // a builder with an `order_by` default of empty vec - let order_by = order_by - .map(|x| x.into_iter().map(|x| x.expr).collect::>()) - .unwrap_or_default(); - let mut builder = agg_fn.order_by(order_by); + // a builder with an `null_treatment` with option None + let mut builder = agg_fn.null_treatment(None); - if distinct { + if let Some(order_by_cols) = order_by { + let order_by_cols = to_sort_expressions(order_by_cols); + builder = builder.order_by(order_by_cols); + } + + if let Some(true) = distinct { builder = builder.distinct(); } @@ -341,39 +63,11 @@ fn add_builder_fns_to_aggregate( builder = builder.filter(filter.expr); } - // would be nice if all the options builder methods accepted Option ... 
builder = builder.null_treatment(null_treatment.map(DFNullTreatment::from)); Ok(builder.build()?.into()) } -#[pyfunction] -pub fn first_value( - expr: PyExpr, - distinct: bool, - filter: Option, - order_by: Option>, - null_treatment: Option, -) -> PyResult { - // If we initialize the UDAF with order_by directly, then it gets over-written by the builder - let agg_fn = functions_aggregate::expr_fn::first_value(expr.expr, None); - - add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) -} - -#[pyfunction] -pub fn last_value( - expr: PyExpr, - distinct: bool, - filter: Option, - order_by: Option>, - null_treatment: Option, -) -> PyResult { - let agg_fn = functions_aggregate::expr_fn::last_value(vec![expr.expr]); - - add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) -} - #[pyfunction] fn in_list(expr: PyExpr, value: Vec, negated: bool) -> PyExpr { datafusion::logical_expr::in_list( @@ -505,25 +199,6 @@ fn col(name: &str) -> PyResult { }) } -// TODO: should we just expose this in python? -/// Create a COUNT(1) aggregate expression -#[pyfunction] -fn count_star() -> PyExpr { - functions_aggregate::expr_fn::count(lit(1)).into() -} - -/// Wrapper for [`functions_aggregate::expr_fn::count`] -/// Count the number of non-null values in the column -#[pyfunction] -fn count(expr: PyExpr, distinct: bool) -> PyResult { - let expr = functions_aggregate::expr_fn::count(expr.expr); - if distinct { - Ok(expr.distinct().build()?.into()) - } else { - Ok(expr.into()) - } -} - /// Create a CASE WHEN statement with literal WHEN expressions for comparison to the base expression. #[pyfunction] fn case(expr: PyExpr) -> PyResult { @@ -646,24 +321,46 @@ fn window( }) } +// Generates a [pyo3] wrapper for associated aggregate functions. +// All of the builder options are exposed to the python internal +// function and we rely on the wrappers to only use those that +// are appropriate. macro_rules! 
aggregate_function { - ($NAME: ident, $FUNC: path) => { - aggregate_function!($NAME, $FUNC, stringify!($NAME)); + ($NAME: ident) => { + aggregate_function!($NAME, expr); }; - ($NAME: ident, $FUNC: path, $DOC: expr) => { - #[doc = $DOC] + ($NAME: ident, $($arg:ident)*) => { #[pyfunction] - #[pyo3(signature = (*args, distinct=false))] - fn $NAME(args: Vec, distinct: bool) -> PyExpr { - let expr = datafusion::logical_expr::Expr::AggregateFunction(AggregateFunction { - func: $FUNC(), - args: args.into_iter().map(|e| e.into()).collect(), - distinct, - filter: None, - order_by: None, - null_treatment: None, - }); - expr.into() + fn $NAME( + $($arg: PyExpr),*, + distinct: Option, + filter: Option, + order_by: Option>, + null_treatment: Option + ) -> PyResult { + let agg_fn = functions_aggregate::expr_fn::$NAME($($arg.into()),*); + + add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) + } + }; +} + +macro_rules! aggregate_function_vec_args { + ($NAME: ident) => { + aggregate_function_vec_args!($NAME, expr); + }; + ($NAME: ident, $($arg:ident)*) => { + #[pyfunction] + fn $NAME( + $($arg: PyExpr),*, + distinct: Option, + filter: Option, + order_by: Option>, + null_treatment: Option + ) -> PyResult { + let agg_fn = functions_aggregate::expr_fn::$NAME(vec![$($arg.into()),*]); + + add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) } }; } @@ -891,9 +588,120 @@ array_fn!(array_resize, array size value); array_fn!(flatten, array); array_fn!(range, start stop step); -aggregate_function!(array_agg, functions_aggregate::array_agg::array_agg_udaf); -aggregate_function!(max, functions_aggregate::min_max::max_udaf); -aggregate_function!(min, functions_aggregate::min_max::min_udaf); +aggregate_function!(array_agg); +aggregate_function!(max); +aggregate_function!(min); +aggregate_function!(avg); +aggregate_function!(sum); +aggregate_function!(bit_and); +aggregate_function!(bit_or); +aggregate_function!(bit_xor); 
+aggregate_function!(bool_and); +aggregate_function!(bool_or); +aggregate_function!(corr, y x); +aggregate_function!(count); +aggregate_function!(covar_samp, y x); +aggregate_function!(covar_pop, y x); +aggregate_function!(median); +aggregate_function!(regr_slope, y x); +aggregate_function!(regr_intercept, y x); +aggregate_function!(regr_count, y x); +aggregate_function!(regr_r2, y x); +aggregate_function!(regr_avgx, y x); +aggregate_function!(regr_avgy, y x); +aggregate_function!(regr_sxx, y x); +aggregate_function!(regr_syy, y x); +aggregate_function!(regr_sxy, y x); +aggregate_function!(stddev); +aggregate_function!(stddev_pop); +aggregate_function!(var_sample); +aggregate_function!(var_pop); +aggregate_function!(approx_distinct); +aggregate_function!(approx_median); + +// Code is commented out since grouping is not yet implemented +// https://github.com/apache/datafusion-python/issues/861 +// aggregate_function!(grouping); + +#[pyfunction] +pub fn approx_percentile_cont( + expression: PyExpr, + percentile: f64, + num_centroids: Option, // enforces optional arguments at the end, currently + filter: Option, +) -> PyResult { + let args = if let Some(num_centroids) = num_centroids { + vec![expression.expr, lit(percentile), lit(num_centroids)] + } else { + vec![expression.expr, lit(percentile)] + }; + let udaf = functions_aggregate::approx_percentile_cont::approx_percentile_cont_udaf(); + let agg_fn = udaf.call(args); + + add_builder_fns_to_aggregate(agg_fn, None, filter, None, None) +} + +#[pyfunction] +pub fn approx_percentile_cont_with_weight( + expression: PyExpr, + weight: PyExpr, + percentile: f64, + filter: Option, +) -> PyResult { + let agg_fn = functions_aggregate::expr_fn::approx_percentile_cont_with_weight( + expression.expr, + weight.expr, + lit(percentile), + ); + + add_builder_fns_to_aggregate(agg_fn, None, filter, None, None) +} + +aggregate_function_vec_args!(last_value); + +// We handle first_value explicitly because the signature expects an 
order_by +// https://github.com/apache/datafusion/issues/12376 +#[pyfunction] +pub fn first_value( + expr: PyExpr, + distinct: Option, + filter: Option, + order_by: Option>, + null_treatment: Option, +) -> PyResult { + // If we initialize the UDAF with order_by directly, then it gets over-written by the builder + let agg_fn = functions_aggregate::expr_fn::first_value(expr.expr, None); + + add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) +} + +// nth_value requires a non-expr argument +#[pyfunction] +pub fn nth_value( + expr: PyExpr, + n: i64, + distinct: Option, + filter: Option, + order_by: Option>, + null_treatment: Option, +) -> PyResult { + let agg_fn = datafusion::functions_aggregate::nth_value::nth_value(vec![expr.expr, lit(n)]); + add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) +} + +// string_agg requires a non-expr argument +#[pyfunction] +pub fn string_agg( + expr: PyExpr, + delimiter: String, + distinct: Option, + filter: Option, + order_by: Option>, + null_treatment: Option, +) -> PyResult { + let agg_fn = datafusion::functions_aggregate::string_agg::string_agg(expr.expr, lit(delimiter)); + add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) +} fn add_builder_fns_to_window( window_fn: Expr, @@ -1042,7 +850,6 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(cosh))?; m.add_wrapped(wrap_pyfunction!(cot))?; m.add_wrapped(wrap_pyfunction!(count))?; - m.add_wrapped(wrap_pyfunction!(count_star))?; m.add_wrapped(wrap_pyfunction!(covar_pop))?; m.add_wrapped(wrap_pyfunction!(covar_samp))?; m.add_wrapped(wrap_pyfunction!(current_date))?; @@ -1059,7 +866,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(floor))?; m.add_wrapped(wrap_pyfunction!(from_unixtime))?; m.add_wrapped(wrap_pyfunction!(gcd))?; - m.add_wrapped(wrap_pyfunction!(grouping))?; + // 
m.add_wrapped(wrap_pyfunction!(grouping))?; m.add_wrapped(wrap_pyfunction!(in_list))?; m.add_wrapped(wrap_pyfunction!(initcap))?; m.add_wrapped(wrap_pyfunction!(isnan))?; @@ -1113,6 +920,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(starts_with))?; m.add_wrapped(wrap_pyfunction!(stddev))?; m.add_wrapped(wrap_pyfunction!(stddev_pop))?; + m.add_wrapped(wrap_pyfunction!(string_agg))?; m.add_wrapped(wrap_pyfunction!(strpos))?; m.add_wrapped(wrap_pyfunction!(r#struct))?; // Use raw identifier since struct is a keyword m.add_wrapped(wrap_pyfunction!(substr))?; @@ -1134,7 +942,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(upper))?; m.add_wrapped(wrap_pyfunction!(self::uuid))?; // Use self to avoid name collision m.add_wrapped(wrap_pyfunction!(var_pop))?; - m.add_wrapped(wrap_pyfunction!(var_samp))?; + m.add_wrapped(wrap_pyfunction!(var_sample))?; m.add_wrapped(wrap_pyfunction!(window))?; m.add_wrapped(wrap_pyfunction!(regr_avgx))?; m.add_wrapped(wrap_pyfunction!(regr_avgy))?; @@ -1147,6 +955,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(regr_syy))?; m.add_wrapped(wrap_pyfunction!(first_value))?; m.add_wrapped(wrap_pyfunction!(last_value))?; + m.add_wrapped(wrap_pyfunction!(nth_value))?; m.add_wrapped(wrap_pyfunction!(bit_and))?; m.add_wrapped(wrap_pyfunction!(bit_or))?; m.add_wrapped(wrap_pyfunction!(bit_xor))?; From 89b77abf14e8e2e7ee2a83ab23717cb7c1491a02 Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Mon, 9 Sep 2024 17:44:32 -0500 Subject: [PATCH 032/248] prepare release of `datafusion-python` 41.0.0 (#866) * bump datafusion-python version * generate changelog I used the following command, then pasted the output into CHANGELOG.md. 
```console ./dev/release/generate-changelog.py 40.1.0-rc1 HEAD 41.0.0 ``` --- CHANGELOG.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ Cargo.lock | 2 +- Cargo.toml | 2 +- 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 305af5720..56a2bfc79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,51 @@ # DataFusion Python Changelog +## [41.0.0](https://github.com/apache/datafusion-python/tree/41.0.0) (2024-09-09) + +This release consists of 19 commits from 6 contributors. See credits at the end of this changelog for more information. + +**Implemented enhancements:** + +- feat: enable list of paths for read_csv [#824](https://github.com/apache/datafusion-python/pull/824) (mesejo) +- feat: better exception and message for table not found [#851](https://github.com/apache/datafusion-python/pull/851) (mesejo) +- feat: make cast accept built-in Python types [#858](https://github.com/apache/datafusion-python/pull/858) (mesejo) + +**Other:** + +- chore: Prepare for 40.0.0 release [#801](https://github.com/apache/datafusion-python/pull/801) (andygrove) +- Add typing-extensions dependency to pyproject [#805](https://github.com/apache/datafusion-python/pull/805) (timsaucer) +- Upgrade deps to datafusion 41 [#802](https://github.com/apache/datafusion-python/pull/802) (Michael-J-Ward) +- Fix SessionContext init with only SessionConfig [#827](https://github.com/apache/datafusion-python/pull/827) (jcrist) +- build(deps): upgrade actions/{upload,download}-artifact@v3 to v4 [#829](https://github.com/apache/datafusion-python/pull/829) (Michael-J-Ward) +- Run ruff format in CI [#837](https://github.com/apache/datafusion-python/pull/837) (timsaucer) +- Add PyCapsule support for Arrow import and export [#825](https://github.com/apache/datafusion-python/pull/825) (timsaucer) +- Feature/expose when function [#836](https://github.com/apache/datafusion-python/pull/836) (timsaucer) +- Add Window Functions for use with function builder 
[#808](https://github.com/apache/datafusion-python/pull/808) (timsaucer) +- chore: fix typos [#844](https://github.com/apache/datafusion-python/pull/844) (mesejo) +- build(ci): use proper mac runners [#841](https://github.com/apache/datafusion-python/pull/841) (Michael-J-Ward) +- Set of small features [#839](https://github.com/apache/datafusion-python/pull/839) (timsaucer) +- chore: fix docstrings, typos [#852](https://github.com/apache/datafusion-python/pull/852) (mesejo) +- chore: Use datafusion re-exported dependencies [#856](https://github.com/apache/datafusion-python/pull/856) (emgeee) +- add guidelines on separating python and rust code [#860](https://github.com/apache/datafusion-python/pull/860) (Michael-J-Ward) +- Update Aggregate functions to take builder parameters [#859](https://github.com/apache/datafusion-python/pull/859) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 7 Tim Saucer + 5 Daniel Mesejo + 4 Michael J Ward + 1 Andy Grove + 1 Jim Crist-Harif + 1 Matt Green +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + + ## [40.0.0](https://github.com/apache/datafusion-python/tree/40.0.0) (2024-08-09) This release consists of 18 commits from 4 contributors. See credits at the end of this changelog for more information. 
diff --git a/Cargo.lock b/Cargo.lock index 43ee3055d..d507321db 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1067,7 +1067,7 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "40.0.0" +version = "41.0.0" dependencies = [ "arrow", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index e9a4ababb..8635776e2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "40.0.0" +version = "41.0.0" homepage = "https://datafusion.apache.org/python" repository = "https://github.com/apache/datafusion-python" authors = ["Apache DataFusion "] From 02d4453af1c18d7e31c21d60574456226b71793e Mon Sep 17 00:00:00 2001 From: Daniel Mesejo Date: Wed, 11 Sep 2024 19:28:10 +0200 Subject: [PATCH 033/248] feat: expose between (#868) closes #809 --- python/datafusion/expr.py | 16 ++++++++++++ python/datafusion/tests/test_functions.py | 31 +++++++++++++++++++++++ src/expr.rs | 11 ++++++++ 3 files changed, 58 insertions(+) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index bd6a86fb8..f6a51ce2a 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -393,6 +393,22 @@ def cast( return Expr(self.expr.cast(to)) + def between(self, low: Any, high: Any, negated: bool = False) -> Expr: + """Returns ``True`` if this expression is between a given range. + + Args: + low: lower bound of the range (inclusive). + high: higher bound of the range (inclusive). + negated: negates whether the expression is between a given range + """ + if not isinstance(low, Expr): + low = Expr.literal(low) + + if not isinstance(high, Expr): + high = Expr.literal(high) + + return Expr(self.expr.between(low.expr, high.expr, negated=negated)) + def rex_type(self) -> RexType: """Return the Rex Type of this expression. 
diff --git a/python/datafusion/tests/test_functions.py b/python/datafusion/tests/test_functions.py index 8e3c51397..9353f872d 100644 --- a/python/datafusion/tests/test_functions.py +++ b/python/datafusion/tests/test_functions.py @@ -1024,3 +1024,34 @@ def test_cast(df, python_datatype, name: str, expected): result = df.collect() result = result[0] assert result.column(0) == result.column(1) + + +@pytest.mark.parametrize( + "negated, low, high, expected", + [ + pytest.param(False, 3, 5, {"filtered": [4, 5]}), + pytest.param(False, 4, 5, {"filtered": [4, 5]}), + pytest.param(True, 3, 5, {"filtered": [6]}), + pytest.param(True, 4, 6, []), + ], +) +def test_between(df, negated, low, high, expected): + df = df.filter(column("b").between(low, high, negated=negated)).select( + column("b").alias("filtered") + ) + + actual = df.collect() + + if expected: + actual = actual[0].to_pydict() + assert actual == expected + else: + assert len(actual) == 0 # the rows are empty + + +def test_between_default(df): + df = df.filter(column("b").between(3, 5)).select(column("b").alias("filtered")) + expected = {"filtered": [4, 5]} + + actual = df.collect()[0].to_pydict() + assert actual == expected diff --git a/src/expr.rs b/src/expr.rs index ab16f2872..0e1a193f3 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -293,6 +293,17 @@ impl PyExpr { expr.into() } + #[pyo3(signature = (low, high, negated=false))] + pub fn between(&self, low: PyExpr, high: PyExpr, negated: bool) -> PyExpr { + let expr = Expr::Between(Between::new( + Box::new(self.expr.clone()), + negated, + Box::new(low.into()), + Box::new(high.into()), + )); + expr.into() + } + /// A Rex (Row Expression) specifies a single row of data. That specification /// could include user defined functions or types. RexType identifies the row /// as one of the possible valid `RexTypes`. 
From 6c8bf5f8aadaf5a3431da272ed6b3afa06efa8ec Mon Sep 17 00:00:00 2001 From: Matt Green Date: Tue, 17 Sep 2024 13:58:11 -0700 Subject: [PATCH 034/248] Upgrade datafusion (#867) * update dependencies * update get_logical_plan signature * remove row_number() function row_number was converted to a UDF in datafusion v42 https://github.com/apache/datafusion/pull/12030 This specific functionality needs to be added back in. * remove unneeded dependency * fix pyo3 warnings Implicit defaults for trailing optional arguments have been deprecated in pyo3 v0.22.0 https://github.com/PyO3/pyo3/pull/4078 * update object_store dependency * change PyExpr -> PySortExpr * comment out key.extract::<&PyTuple>() condition statement * change more instances of PyExpr > PySortExpr * update function signatures to use _bound versions * remove clone * Working through some of the sort requirement changes * remove unused import * expr.display_name is deprecated, used format!() + schema_name() instead * expr.canonical_name() is deprecated, use format!() expr instead * remove comment * fix tuple extraction in dataframe.__getitem__() * remove unneeded import * Add docstring comments to SortExpr python class * change extract() to downcast() Co-authored-by: Michael J Ward * deprecate Expr::display_name Ref: https://github.com/apache/datafusion/pull/11797 * fix lint errors * update datafusion commit hash * fix type in cargo file for arrow features * upgrade to datafusion 42 * cleanup --------- Co-authored-by: Tim Saucer Co-authored-by: Michael J Ward Co-authored-by: Michael-J-Ward --- Cargo.lock | 784 +++++++++--------- Cargo.toml | 17 +- python/datafusion/context.py | 13 +- python/datafusion/dataframe.py | 8 +- python/datafusion/expr.py | 83 +- python/datafusion/functions.py | 95 +-- python/datafusion/tests/test_expr.py | 34 +- python/datafusion/tests/test_sql.py | 11 +- .../datafusion/tests/test_wrapper_coverage.py | 5 +- src/common/data_type.rs | 8 +- src/common/schema.rs | 37 +- src/context.rs | 
18 +- src/dataframe.rs | 14 +- src/dataset.rs | 2 +- src/dataset_exec.rs | 4 +- src/expr.rs | 41 +- src/expr/sort.rs | 6 +- src/expr/sort_expr.rs | 21 +- src/expr/window.rs | 5 +- src/functions.rs | 87 +- src/pyarrow_filter_expression.rs | 2 +- src/udf.rs | 10 +- 22 files changed, 710 insertions(+), 595 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d507321db..d06073b6f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,18 +4,18 @@ version = 3 [[package]] name = "addr2line" -version = "0.22.0" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" +checksum = "f5fb1d8e4442bd405fdfd1dacb42792696b0cf9cb15882e5d097b742a676d375" dependencies = [ "gimli", ] [[package]] -name = "adler" -version = "1.0.2" +name = "adler2" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" [[package]] name = "adler32" @@ -84,9 +84,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.86" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" +checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" [[package]] name = "apache-avro" @@ -118,21 +118,21 @@ dependencies = [ [[package]] name = "arrayref" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d151e35f61089500b617991b791fc8bfd237ae50cd5950803758a179b41e67a" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" [[package]] name = "arrayvec" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05048a8932648b63f21c37d88b552ccc8a65afb6dfe9fc9f30ce79174c2e7a85" +checksum = "45aef0d9cf9a039bf6cd1acc451b137aca819977b0928dece52bd92811b640ba" dependencies = [ "arrow-arith", "arrow-array", @@ -152,9 +152,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d8a57966e43bfe9a3277984a14c24ec617ad874e4c0e1d2a1b083a39cfbf22c" +checksum = "03675e42d1560790f3524800e41403b40d0da1c793fe9528929fde06d8c7649a" dependencies = [ "arrow-array", "arrow-buffer", @@ -167,9 +167,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f4a9468c882dc66862cef4e1fd8423d47e67972377d85d80e022786427768c" +checksum = "cd2bf348cf9f02a5975c5962c7fa6dee107a2009a7b41ac5fb1a027e12dc033f" dependencies = [ "ahash", "arrow-buffer", @@ -184,9 +184,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c975484888fc95ec4a632cdc98be39c085b1bb518531b0c80c5d462063e5daa1" +checksum = "3092e37715f168976012ce52273c3989b5793b0db5f06cbaa246be25e5f0924d" dependencies = [ "bytes", "half", @@ -195,9 +195,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da26719e76b81d8bc3faad1d4dbdc1bcc10d14704e63dc17fc9f3e7e1e567c8e" +checksum = "7ce1018bb710d502f9db06af026ed3561552e493e989a79d0d0f5d9cf267a785" dependencies = [ "arrow-array", "arrow-buffer", @@ -216,9 
+216,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c13c36dc5ddf8c128df19bab27898eea64bf9da2b555ec1cd17a8ff57fba9ec2" +checksum = "fd178575f45624d045e4ebee714e246a05d9652e41363ee3f57ec18cca97f740" dependencies = [ "arrow-array", "arrow-buffer", @@ -235,9 +235,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd9d6f18c65ef7a2573ab498c374d8ae364b4a4edf67105357491c031f716ca5" +checksum = "4e4ac0c4ee79150afe067dc4857154b3ee9c1cd52b5f40d59a77306d0ed18d65" dependencies = [ "arrow-buffer", "arrow-schema", @@ -247,9 +247,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e786e1cdd952205d9a8afc69397b317cfbb6e0095e445c69cda7e8da5c1eeb0f" +checksum = "bb307482348a1267f91b0912e962cd53440e5de0f7fb24c5f7b10da70b38c94a" dependencies = [ "arrow-array", "arrow-buffer", @@ -262,9 +262,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb22284c5a2a01d73cebfd88a33511a3234ab45d66086b2ca2d1228c3498e445" +checksum = "d24805ba326758effdd6f2cbdd482fcfab749544f21b134701add25b33f474e6" dependencies = [ "arrow-array", "arrow-buffer", @@ -282,9 +282,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42745f86b1ab99ef96d1c0bcf49180848a64fe2c7a7a0d945bc64fa2b21ba9bc" +checksum = "644046c479d80ae8ed02a7f1e1399072ea344ca6a7b0e293ab2d5d9ed924aa3b" dependencies = [ "arrow-array", "arrow-buffer", @@ -297,9 +297,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "52.2.0" +version = "53.0.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cd09a518c602a55bd406bcc291a967b284cfa7a63edfbf8b897ea4748aad23c" +checksum = "a29791f8eb13b340ce35525b723f5f0df17ecb955599e11f65c2a94ab34e2efb" dependencies = [ "ahash", "arrow-array", @@ -311,18 +311,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e972cd1ff4a4ccd22f86d3e53e835c2ed92e0eea6a3e8eadb72b4f1ac802cf8" +checksum = "c85320a3a2facf2b2822b57aa9d6d9d55edb8aee0b6b5d3b8df158e503d10858" dependencies = [ "bitflags 2.6.0", ] [[package]] name = "arrow-select" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "600bae05d43483d216fb3494f8c32fdbefd8aa4e1de237e790dbb3d9f44690a3" +checksum = "9cc7e6b582e23855fd1625ce46e51647aa440c20ea2e71b1d748e0839dd73cba" dependencies = [ "ahash", "arrow-array", @@ -334,9 +334,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dc1985b67cb45f6606a248ac2b4a288849f196bab8c657ea5589f47cdd55e6" +checksum = "0775b6567c66e56ded19b87a954b6b1beffbdd784ef95a3a2b03f59570c1d230" dependencies = [ "arrow-array", "arrow-buffer", @@ -375,18 +375,18 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn", ] [[package]] name = "async-trait" -version = "0.1.81" +version = "0.1.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" +checksum = "a27b8a3a6e1a44fa4c8baf1f653e4172e81486d4941f2237e20dc2d0cf4ddff1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn", ] [[package]] @@ -412,17 +412,17 @@ checksum = 
"0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" [[package]] name = "backtrace" -version = "0.3.73" +version = "0.3.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" +checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" dependencies = [ "addr2line", - "cc", "cfg-if", "libc", "miniz_oxide", "object", "rustc-demangle", + "windows-targets", ] [[package]] @@ -460,9 +460,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.3" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9ec96fe9a81b5e365f9db71fe00edc4fe4ca2cc7dcb7861f0603012a7caa210" +checksum = "d82033247fd8e890df8f740e407ad4d038debb9eb1f40533fffb32e7d17dc6f7" dependencies = [ "arrayref", "arrayvec", @@ -542,12 +542,13 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.7" +version = "1.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26a5c3fd7bfa1ce3897a3a3501d362b2d87b7f2583ebcb4a949ec25911025cbc" +checksum = "45bcde016d64c21da4be18b655631e5ab6d3107607e71a73a9f53eb48aae23fb" dependencies = [ "jobserver", "libc", + "shlex", ] [[package]] @@ -566,7 +567,7 @@ dependencies = [ "iana-time-zone", "num-traits", "serde", - "windows-targets 0.52.6", + "windows-targets", ] [[package]] @@ -593,9 +594,9 @@ dependencies = [ [[package]] name = "cmake" -version = "0.1.50" +version = "0.1.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" dependencies = [ "cc", ] @@ -633,9 +634,9 @@ dependencies = [ [[package]] name = "constant_time_eq" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" 
+checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" [[package]] name = "core-foundation" @@ -649,9 +650,9 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.6" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "core2" @@ -664,9 +665,9 @@ dependencies = [ [[package]] name = "cpufeatures" -version = "0.2.12" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0" dependencies = [ "libc", ] @@ -731,9 +732,9 @@ checksum = "7762d17f1241643615821a8455a0b2c3e803784b058693d990b11f2dce25a0ca" [[package]] name = "dashmap" -version = "6.0.1" +version = "6.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "804c8821570c3f8b70230c2ba75ffa5c0f9a4189b9a432b6656c536712acae28" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" dependencies = [ "cfg-if", "crossbeam-utils", @@ -745,9 +746,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4fd4a99fc70d40ef7e52b243b4a399c3f8d353a40d5ecb200deee05e49c61bb" +checksum = "ee907b081e45e1d14e1f327e89ef134f91fcebad0bfc2dc229fa9f6044379682" dependencies = [ "ahash", "apache-avro", @@ -769,6 +770,7 @@ dependencies = [ "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-nested", + "datafusion-functions-window", "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -781,7 +783,7 @@ dependencies = [ "half", "hashbrown", "indexmap", - "itertools 0.12.1", + 
"itertools", "log", "num-traits", "num_cpus", @@ -803,9 +805,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13b3cfbd84c6003594ae1972314e3df303a27ce8ce755fcea3240c90f4c0529" +checksum = "6c2b914f6e33c429af7d8696c72a47ed9225d7e2b82c747ebdfa2408ed53579f" dependencies = [ "arrow-schema", "async-trait", @@ -813,13 +815,14 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-physical-plan", + "parking_lot", ] [[package]] name = "datafusion-common" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44fdbc877e3e40dcf88cc8f283d9f5c8851f0a3aa07fee657b1b75ac1ad49b9c" +checksum = "3a84f8e76330c582a6b8ada0b2c599ca46cfe46b7585e458fc3f4092bc722a18" dependencies = [ "ahash", "apache-avro", @@ -835,24 +838,27 @@ dependencies = [ "num_cpus", "object_store", "parquet", + "paste", "pyo3", "sqlparser", + "tokio", ] [[package]] name = "datafusion-common-runtime" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a7496d1f664179f6ce3a5cbef6566056ccaf3ea4aa72cc455f80e62c1dd86b1" +checksum = "cf08cc30d92720d557df13bd5a5696213bd5ea0f38a866d8d85055d866fba774" dependencies = [ + "log", "tokio", ] [[package]] name = "datafusion-execution" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "799e70968c815b611116951e3dd876aef04bf217da31b72eec01ee6a959336a1" +checksum = "86bc4183d5c45b9f068a6f351678a0d1eb1225181424542bb75db18ec280b822" dependencies = [ "arrow", "chrono", @@ -871,9 +877,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c1841c409d9518c17971d15c9bae62e629eb937e6fb6c68cd32e9186f8b30d2" +checksum = 
"202119ce58e4d103e37ae64aab40d4e574c97bdd2bea994bf307b175fcbfa74d" dependencies = [ "ahash", "arrow", @@ -881,6 +887,9 @@ dependencies = [ "arrow-buffer", "chrono", "datafusion-common", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", "paste", "serde_json", "sqlparser", @@ -888,11 +897,22 @@ dependencies = [ "strum_macros 0.26.4", ] +[[package]] +name = "datafusion-expr-common" +version = "42.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8b181ce8569216abb01ef3294aa16c0a40d7d39350c2ff01ede00f167a535f2" +dependencies = [ + "arrow", + "datafusion-common", + "paste", +] + [[package]] name = "datafusion-functions" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8e481cf34d2a444bd8fa09b65945f0ce83dc92df8665b761505b3d9f351bebb" +checksum = "6e4124b8066444e05a24472f852e94cf56546c0f4d92d00f018f207216902712" dependencies = [ "arrow", "arrow-buffer", @@ -905,7 +925,7 @@ dependencies = [ "datafusion-expr", "hashbrown", "hex", - "itertools 0.12.1", + "itertools", "log", "md-5", "rand", @@ -917,9 +937,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b4ece19f73c02727e5e8654d79cd5652de371352c1df3c4ac3e419ecd6943fb" +checksum = "b94acdac235ea21810150a89751617ef2db7e32eba27f54be48a81bde2bfe119" dependencies = [ "ahash", "arrow", @@ -927,17 +947,34 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr", "datafusion-physical-expr-common", + "half", "log", "paste", "sqlparser", ] +[[package]] +name = "datafusion-functions-aggregate-common" +version = "42.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5c9ea085bbf900bf16e2ca0f56fc56236b2e4f2e1a2cccb67bcd83c5ab4ad0ef" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "datafusion-physical-expr-common", + "rand", +] + [[package]] name = "datafusion-functions-nested" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1474552cc824e8c9c88177d454db5781d4b66757d4aca75719306b8343a5e8d" +checksum = "6c882e61665ed60c5ce9b061c1e587aeb8ae5ae4bcb5e5f2465139ab25328e0f" dependencies = [ "arrow", "arrow-array", @@ -949,17 +986,30 @@ dependencies = [ "datafusion-expr", "datafusion-functions", "datafusion-functions-aggregate", - "itertools 0.12.1", + "datafusion-physical-expr-common", + "itertools", "log", "paste", "rand", ] +[[package]] +name = "datafusion-functions-window" +version = "42.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98a354ce96df3ca6d025093adac9fd55ca09931c9b6f2630140721a95873fde4" +dependencies = [ + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr-common", + "log", +] + [[package]] name = "datafusion-optimizer" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "791ff56f55608bc542d1ea7a68a64bdc86a9413f5a381d06a39fd49c2a3ab906" +checksum = "baf677c74fb7b5a1899ef52709e4a70fff3ed80bdfb4bbe495909810e83d5f39" dependencies = [ "arrow", "async-trait", @@ -969,7 +1019,7 @@ dependencies = [ "datafusion-physical-expr", "hashbrown", "indexmap", - "itertools 0.12.1", + "itertools", "log", "paste", "regex-syntax", @@ -977,9 +1027,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a223962b3041304a3e20ed07a21d5de3d88d7e4e71ca192135db6d24e3365a4" +checksum = "30b077999f6eb6c43d6b25bc66332a3be2f693c382840f008dd763b8540f9530" dependencies = [ "ahash", "arrow", @@ 
-993,12 +1043,14 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", "datafusion-physical-expr-common", "half", "hashbrown", "hex", "indexmap", - "itertools 0.12.1", + "itertools", "log", "paste", "petgraph", @@ -1007,35 +1059,37 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db5e7d8532a1601cd916881db87a70b0a599900d23f3db2897d389032da53bc6" +checksum = "dce847f885c2b13bbe29f5c8b7948797131aa470af6e16d2a94f4428b4f4f1bd" dependencies = [ "ahash", "arrow", "datafusion-common", - "datafusion-expr", + "datafusion-expr-common", "hashbrown", "rand", ] [[package]] name = "datafusion-physical-optimizer" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdb9c78f308e050f5004671039786a925c3fee83b90004e9fcfd328d7febdcc0" +checksum = "d13238e3b9fdd62a4c18760bfef714bb990d1e1d3430e9f416aae4b3cfaa71af" dependencies = [ + "arrow-schema", "datafusion-common", "datafusion-execution", "datafusion-physical-expr", "datafusion-physical-plan", + "itertools", ] [[package]] name = "datafusion-physical-plan" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d1116949432eb2d30f6362707e2846d942e491052a206f2ddcb42d08aea1ffe" +checksum = "faba6f55a7eaf0241d07d12c2640de52742646b10f754485d5192bdfe2c9ceae" dependencies = [ "ahash", "arrow", @@ -1050,13 +1104,14 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-functions-aggregate", + "datafusion-functions-aggregate-common", "datafusion-physical-expr", "datafusion-physical-expr-common", "futures", "half", "hashbrown", "indexmap", - "itertools 0.12.1", + "itertools", "log", "once_cell", "parking_lot", @@ -1083,7 +1138,7 @@ dependencies = [ 
"pyo3-build-config", "rand", "regex-syntax", - "syn 2.0.72", + "syn", "tokio", "url", "uuid", @@ -1091,9 +1146,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b45d0180711165fe94015d7c4123eb3e1cf5fb60b1506453200b8d1ce666bef0" +checksum = "dad8d96a9b52e1aa24f9373696a815be828193efce7cb0bbd2140b6bb67d1819" dependencies = [ "arrow", "arrow-array", @@ -1108,15 +1163,15 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "41.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf0a0055aa98246c79f98f0d03df11f16cb7adc87818d02d4413e3f3cdadbbee" +checksum = "f92b1b80e98bf5a9921bf118816e0e766d18527e343153321fcccfe4d68c5c45" dependencies = [ "arrow-buffer", "async-recursion", "chrono", "datafusion", - "itertools 0.12.1", + "itertools", "object_store", "pbjson-types", "prost", @@ -1135,12 +1190,6 @@ dependencies = [ "subtle", ] -[[package]] -name = "doc-comment" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" - [[package]] name = "dyn-clone" version = "1.0.17" @@ -1171,9 +1220,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.1.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" [[package]] name = "fixedbitset" @@ -1193,9 +1242,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.31" +version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920" +checksum = "324a1be68054ef05ad64b861cc9eaf1d623d2d8cb25b4bf2cb9cdd902b4bf253" dependencies = [ "crc32fast", 
"miniz_oxide", @@ -1272,7 +1321,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn", ] [[package]] @@ -1328,9 +1377,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.29.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" +checksum = "32085ea23f3234fc7846555e85283ba4de91e21016dc0455a16286d87a292d64" [[package]] name = "glob" @@ -1340,9 +1389,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" +checksum = "524e8ac6999421f49a846c2d4411f337e53497d8ec55d67753beffa43c5d9205" dependencies = [ "atomic-waker", "bytes", @@ -1470,16 +1519,16 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.2" +version = "0.27.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" +checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" dependencies = [ "futures-util", "http", "hyper", "hyper-util", "rustls", - "rustls-native-certs", + "rustls-native-certs 0.8.0", "rustls-pki-types", "tokio", "tokio-rustls", @@ -1488,9 +1537,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9" +checksum = "da62f120a8a37763efb0cf8fdf264b884c7b8b9ac8660b900c8661030c00e6ba" dependencies = [ "bytes", "futures-channel", @@ -1508,9 +1557,9 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.60" +version = "0.1.61" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" +checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -1541,9 +1590,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.3.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3fc2e30ba82dd1b3911c8de1ffc143c74a914a14e99514d7637e3099df5ea0" +checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" dependencies = [ "equivalent", "hashbrown", @@ -1575,27 +1624,9 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "ipnet" -version = "2.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" - -[[package]] -name = "itertools" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" -dependencies = [ - "either", -] - -[[package]] -name = "itertools" -version = "0.12.1" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" -dependencies = [ - "either", -] +checksum = "187674a687eed5fe42285b40c6291f9a01517d415fad1c3cbc6a9f778af7fcd4" [[package]] name = "itertools" @@ -1623,9 +1654,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.69" +version = "0.3.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a" dependencies = [ "wasm-bindgen", ] @@ -1702,9 +1733,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.155" 
+version = "0.2.158" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" +checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" [[package]] name = "libflate" @@ -1830,18 +1861,18 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.7.4" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" dependencies = [ - "adler", + "adler2", ] [[package]] name = "mio" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4569e456d394deccd22ce1c1913e6ea0e54519f577285001215d33557431afe4" +checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" dependencies = [ "hermit-abi", "libc", @@ -1941,18 +1972,18 @@ dependencies = [ [[package]] name = "object" -version = "0.36.2" +version = "0.36.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f203fa8daa7bb185f760ae12bd8e097f63d17041dcdcaf675ac54cdf863170e" +checksum = "084f1a5821ac4c651660a94a7153d27ac9d8a53736203f58b31945ded098070a" dependencies = [ "memchr", ] [[package]] name = "object_store" -version = "0.10.2" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6da452820c715ce78221e8202ccc599b4a52f3e1eb3eedb487b680c81a8e3f3" +checksum = "25a0c4b3a0e31f8b66f71ad8064521efa773910196e2cde791436f13409f3b45" dependencies = [ "async-trait", "base64 0.22.1", @@ -1961,7 +1992,7 @@ dependencies = [ "futures", "humantime", "hyper", - "itertools 0.13.0", + "itertools", "md-5", "parking_lot", "percent-encoding", @@ -2020,14 +2051,14 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-targets 
0.52.6", + "windows-targets", ] [[package]] name = "parquet" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e977b9066b4d3b03555c22bdc442f3fadebd96a39111249113087d0edb2691cd" +checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8" dependencies = [ "ahash", "arrow-array", @@ -2076,9 +2107,9 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pbjson" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1030c719b0ec2a2d25a5df729d6cff1acf3cc230bf766f4f97833591f7577b90" +checksum = "c7e6349fa080353f4a597daffd05cb81572a9c031a6d4fff7e504947496fcc68" dependencies = [ "base64 0.21.7", "serde", @@ -2086,21 +2117,21 @@ dependencies = [ [[package]] name = "pbjson-build" -version = "0.6.2" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2580e33f2292d34be285c5bc3dba5259542b083cfad6037b6d70345f24dcb735" +checksum = "6eea3058763d6e656105d1403cb04e0a41b7bbac6362d413e7c33be0c32279c9" dependencies = [ - "heck 0.4.1", - "itertools 0.11.0", + "heck 0.5.0", + "itertools", "prost", "prost-types", ] [[package]] name = "pbjson-types" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18f596653ba4ac51bdecbb4ef6773bc7f56042dc13927910de1684ad3d32aa12" +checksum = "e54e5e7bfb1652f95bc361d76f3c780d8e526b134b85417e774166ee941f0887" dependencies = [ "bytes", "chrono", @@ -2182,7 +2213,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn", ] [[package]] @@ -2220,12 +2251,12 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.20" +version = "0.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e" +checksum = "479cf940fbbb3426c32c5d5176f62ad57549a0bb84773423ba8be9d089f5faba" dependencies = [ "proc-macro2", - "syn 2.0.72", + "syn", ] [[package]] @@ -2239,9 +2270,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.12.6" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29" +checksum = "3b2ecbe40f08db5c006b5764a2645f7f3f141ce756412ac9e1dd6087e6d32995" dependencies = [ "bytes", "prost-derive", @@ -2249,13 +2280,13 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.12.6" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" +checksum = "f8650aabb6c35b860610e9cff5dc1af886c9e25073b7b1712a68972af4281302" dependencies = [ "bytes", "heck 0.5.0", - "itertools 0.12.1", + "itertools", "log", "multimap", "once_cell", @@ -2264,28 +2295,28 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.72", + "syn", "tempfile", ] [[package]] name = "prost-derive" -version = "0.12.6" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" +checksum = "acf0c195eebb4af52c752bec4f52f645da98b6e92077a04110c7f349477ae5ac" dependencies = [ "anyhow", - "itertools 0.12.1", + "itertools", "proc-macro2", "quote", - "syn 2.0.72", + "syn", ] [[package]] name = "prost-types" -version = "0.12.6" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0" +checksum = "60caa6738c7369b940c3d49246a8d1749323674c65cb13010134f5c9bad5b519" dependencies = [ "prost", ] @@ -2301,15 +2332,15 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.21.2" +version = "0.22.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5e00b96a521718e08e03b1a622f01c8a8deb50719335de3f60b3b3950f069d8" +checksum = "15ee168e30649f7f234c3d49ef5a7a6cbf5134289bc46c29ff3155fa3221c225" dependencies = [ "cfg-if", "indoc", "libc", "memoffset", - "parking_lot", + "once_cell", "portable-atomic", "pyo3-build-config", "pyo3-ffi", @@ -2319,9 +2350,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.21.2" +version = "0.22.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7883df5835fafdad87c0d888b266c8ec0f4c9ca48a5bed6bbb592e8dedee1b50" +checksum = "e61cef80755fe9e46bb8a0b8f20752ca7676dcc07a5277d8b7768c6172e529b3" dependencies = [ "once_cell", "target-lexicon", @@ -2329,9 +2360,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.21.2" +version = "0.22.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01be5843dc60b916ab4dad1dca6d20b9b4e6ddc8e15f50c47fe6d85f1fb97403" +checksum = "67ce096073ec5405f5ee2b8b31f03a68e02aa10d5d4f565eca04acc41931fa1c" dependencies = [ "libc", "pyo3-build-config", @@ -2339,34 +2370,34 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.21.2" +version = "0.22.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77b34069fc0682e11b31dbd10321cbf94808394c56fd996796ce45217dfac53c" +checksum = "2440c6d12bc8f3ae39f1e775266fa5122fd0c8891ce7520fa6048e683ad3de28" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.72", + "syn", ] [[package]] name = "pyo3-macros-backend" -version = "0.21.2" +version = "0.22.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08260721f32db5e1a5beae69a55553f56b99bd0e1c3e6e0a5e8851a9d0f5a85c" +checksum = "1be962f0e06da8f8465729ea2cb71a416d2257dff56cbe40a70d3e62a93ae5d1" dependencies = [ - "heck 0.4.1", + "heck 0.5.0", "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.72", + "syn", ] [[package]] name = 
"quad-rand" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "658fa1faf7a4cc5f057c9ee5ef560f717ad9d8dc66d975267f709624d6e1ab88" +checksum = "b76f1009795ca44bb5aaae8fd3f18953e209259c33d9b059b1f53d58ab7511db" [[package]] name = "quick-xml" @@ -2380,9 +2411,9 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.3" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b22d8e7369034b9a7132bc2008cac12f2013c8132b45e0554e6e20e2617f2156" +checksum = "8c7c5fdde3cdae7203427dc4f0a68fe0ed09833edc525a03456b153b79828684" dependencies = [ "bytes", "pin-project-lite", @@ -2398,9 +2429,9 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.6" +version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba92fb39ec7ad06ca2582c0ca834dfeadcaf06ddfc8e635c80aa7e1c05315fdd" +checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6" dependencies = [ "bytes", "rand", @@ -2415,22 +2446,22 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bffec3605b73c6f1754535084a85229fa8a30f86014e6c81aeec4abb68b0285" +checksum = "4fe68c2e9e1a1234e218683dbdf9f9dfcb094113c5ac2b938dfcb9bab4c4140b" dependencies = [ "libc", "once_cell", "socket2", "tracing", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "quote" -version = "1.0.36" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -2467,9 +2498,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.3" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "2a908a6e00f1fdd0dfd9c0eb08ce85126f6d8bbda50017e74bc4a4b7d4a926a4" +checksum = "0884ad60e090bf1345b93da0a5de8923c93884cd03f40dfcfddd3b4bee661853" dependencies = [ "bitflags 2.6.0", ] @@ -2521,9 +2552,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.5" +version = "0.12.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" +checksum = "f8f4955649ef5c38cc7f9e8aa41761d48fb9677197daea9984dc54f56aad5e63" dependencies = [ "base64 0.22.1", "bytes", @@ -2545,7 +2576,7 @@ dependencies = [ "pin-project-lite", "quinn", "rustls", - "rustls-native-certs", + "rustls-native-certs 0.7.3", "rustls-pemfile", "rustls-pki-types", "serde", @@ -2561,7 +2592,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "winreg", + "windows-registry", ] [[package]] @@ -2599,18 +2630,18 @@ checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" [[package]] name = "rustc_version" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ "semver", ] [[package]] name = "rustix" -version = "0.38.34" +version = "0.38.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811" dependencies = [ "bitflags 2.6.0", "errno", @@ -2621,9 +2652,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.12" +version = "0.23.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c58f8c84392efc0a126acce10fa59ff7b3d2ac06ab451a33f2741989b806b044" +checksum = "f2dabaac7466917e566adb06783a81ca48944c6898a1b08b9374106dd671f4c8" dependencies = 
[ "once_cell", "ring", @@ -2635,9 +2666,22 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.7.1" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5" +dependencies = [ + "openssl-probe", + "rustls-pemfile", + "rustls-pki-types", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-native-certs" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a88d6d420651b496bdd98684116959239430022a115c1240e6c3993be0b15fba" +checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a" dependencies = [ "openssl-probe", "rustls-pemfile", @@ -2664,9 +2708,9 @@ checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0" [[package]] name = "rustls-webpki" -version = "0.102.6" +version = "0.102.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e6b52d4fda176fd835fdc55a835d4a89b8499cad995885a21149d5ad62f852e" +checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" dependencies = [ "ring", "rustls-pki-types", @@ -2696,11 +2740,11 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" +checksum = "e9aaafd5a2b6e3d657ff009d82fbd630b6bd54dd4eb06f21693925cdf80f9b8b" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -2724,7 +2768,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.72", + "syn", ] [[package]] @@ -2773,22 +2817,22 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.204" +version = "1.0.210" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"bc76f558e0cbb2a839d37354c575f1dc3fdc6546b5be373ba43d95f231bf7c12" +checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.204" +version = "1.0.210" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222" +checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn", ] [[package]] @@ -2799,14 +2843,14 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn", ] [[package]] name = "serde_json" -version = "1.0.122" +version = "1.0.128" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "784b6203951c57ff748476b126ccb5e8e2959a5c19e5c617ab1956be3dbc68da" +checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8" dependencies = [ "itoa", "memchr", @@ -2816,14 +2860,14 @@ dependencies = [ [[package]] name = "serde_tokenstream" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8790a7c3fe883e443eaa2af6f705952bc5d6e8671a220b9335c8cae92c037e74" +checksum = "64060d864397305347a78851c51588fd283767e7e7589829e8121d65512340f1" dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.72", + "syn", ] [[package]] @@ -2862,6 +2906,12 @@ dependencies = [ "digest", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "siphasher" version = "0.3.11" @@ -2885,24 +2935,23 @@ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "snafu" -version = "0.7.5" +version = "0.8.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" +checksum = "2b835cb902660db3415a672d862905e791e54d306c6e8189168c7f3d9ae1c79d" dependencies = [ - "doc-comment", "snafu-derive", ] [[package]] name = "snafu-derive" -version = "0.7.5" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" +checksum = "38d1e02fca405f6280643174a50c942219f0bbf4dbf7d480f1dd864d6f211ae5" dependencies = [ - "heck 0.4.1", + "heck 0.5.0", "proc-macro2", "quote", - "syn 1.0.109", + "syn", ] [[package]] @@ -2929,9 +2978,9 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "sqlparser" -version = "0.49.0" +version = "0.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a404d0e14905361b918cb8afdb73605e25c1d5029312bd9785142dcb3aa49e" +checksum = "b2e5b515a2bd5168426033e9efbfd05500114833916f1d5c268f938b4ee130ac" dependencies = [ "log", "sqlparser_derive", @@ -2945,7 +2994,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn", ] [[package]] @@ -2979,7 +3028,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.72", + "syn", ] [[package]] @@ -2992,14 +3041,14 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.72", + "syn", ] [[package]] name = "substrait" -version = "0.36.0" +version = "0.41.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1ee6e584c8bf37104b7eb51c25eae07a9321b0e01379bec3b7c462d2f42afbf" +checksum = "2a3bf05f1d7a3fd7a97790d410f6e859b3a98dcde05e7a3fc00b31b0f60fe7cb" dependencies = [ "heck 0.5.0", "pbjson", @@ -3015,7 +3064,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.72", + "syn", "typify", "walkdir", ] @@ -3028,20 +3077,9 @@ 
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "syn" -version = "2.0.72" +version = "2.0.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc4b9b9bf2add8093d3f2c0204471e951b2285580335de42f9d2534f3ae7a8af" +checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" dependencies = [ "proc-macro2", "quote", @@ -3053,6 +3091,9 @@ name = "sync_wrapper" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +dependencies = [ + "futures-core", +] [[package]] name = "target-lexicon" @@ -3062,15 +3103,15 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.11.0" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fcd239983515c23a32fb82099f97d0b11b8c72f654ed659363a95c3dad7a53" +checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" dependencies = [ "cfg-if", "fastrand", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -3090,7 +3131,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn", ] [[package]] @@ -3130,9 +3171,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.39.2" +version = "1.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daa4fb1bc778bd6f04cbfc4bb2d06a7396a8f299dc33ea1900cedaa316f467b1" +checksum = 
"e2b070231665d27ad9ec9b8df639893f46727666c6767db40317fbe920a5d998" dependencies = [ "backtrace", "bytes", @@ -3152,7 +3193,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn", ] [[package]] @@ -3168,9 +3209,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.11" +version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" +checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" dependencies = [ "bytes", "futures-core", @@ -3196,15 +3237,15 @@ dependencies = [ [[package]] name = "tower-layer" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" [[package]] name = "tower-service" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" @@ -3225,7 +3266,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn", ] [[package]] @@ -3270,7 +3311,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn", ] [[package]] @@ -3304,7 +3345,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.72", + "syn", "thiserror", "unicode-ident", ] @@ -3322,7 +3363,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.72", + "syn", "typify-impl", ] @@ -3334,24 +3375,24 @@ checksum = 
"08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" [[package]] name = "unicode-ident" -version = "1.0.12" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" [[package]] name = "unicode-normalization" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] [[package]] name = "unicode-segmentation" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" @@ -3431,34 +3472,35 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5" dependencies = [ "cfg-if", + "once_cell", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.72", + "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.42" 
+version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" +checksum = "61e9300f63a621e96ed275155c108eb6f843b6a26d053f122ab69724559dc8ed" dependencies = [ "cfg-if", "js-sys", @@ -3468,9 +3510,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3478,22 +3520,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" +checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" [[package]] name = "wasm-streams" @@ -3510,9 +3552,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.69" +version = "0.3.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" +checksum = "26fdeaafd9bd129f65e7c031593c24d62186301e0c72c8978fa1678be7d532c0" dependencies = [ "js-sys", "wasm-bindgen", @@ -3533,49 +3575,55 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.52.6", + "windows-targets", ] [[package]] -name = "windows-sys" -version = "0.48.0" +name = "windows-registry" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" dependencies = [ - "windows-targets 0.48.5", + "windows-result", + "windows-strings", + "windows-targets", ] [[package]] -name = "windows-sys" -version = "0.52.0" +name = "windows-result" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-strings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" dependencies = [ - "windows-targets 0.52.6", + "windows-result", + "windows-targets", ] [[package]] name = "windows-sys" -version = "0.59.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.6", + "windows-targets", ] [[package]] -name = "windows-targets" -version = "0.48.5" +name = "windows-sys" +version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - 
"windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", + "windows-targets", ] [[package]] @@ -3584,46 +3632,28 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.52.6", - "windows_aarch64_msvc 0.52.6", - "windows_i686_gnu 0.52.6", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", "windows_i686_gnullvm", - "windows_i686_msvc 0.52.6", - "windows_x86_64_gnu 0.52.6", - "windows_x86_64_gnullvm 0.52.6", - "windows_x86_64_msvc 0.52.6", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" - [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" -[[package]] -name = "windows_i686_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" - [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -3636,64 +3666,30 @@ version = "0.52.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" -[[package]] -name = "windows_i686_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" - [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" - [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" - [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" -[[package]] -name = "winreg" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" -dependencies = [ 
- "cfg-if", - "windows-sys 0.48.0", -] - [[package]] name = "xz2" version = "0.1.7" @@ -3721,7 +3717,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn", ] [[package]] @@ -3769,9 +3765,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.12+zstd.1.5.6" +version = "2.0.13+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a4e40c320c3cb459d9a9ff6de98cff88f4751ee9275d140e2be94a2b74e4c13" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" dependencies = [ "cc", "pkg-config", diff --git a/Cargo.toml b/Cargo.toml index 8635776e2..11dccc4fb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,24 +36,24 @@ substrait = ["dep:datafusion-substrait"] [dependencies] tokio = { version = "1.39", features = ["macros", "rt", "rt-multi-thread", "sync"] } rand = "0.8" -pyo3 = { version = "0.21", features = ["extension-module", "abi3", "abi3-py38"] } -arrow = { version = "52", feature = ["pyarrow"] } -datafusion = { version = "41.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } -datafusion-substrait = { version = "41.0.0", optional = true } -prost = "0.12" # keep in line with `datafusion-substrait` -prost-types = "0.12" # keep in line with `datafusion-substrait` +pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] } +arrow = { version = "53", features = ["pyarrow"] } +datafusion = { version = "42.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } +datafusion-substrait = { version = "42.0.0", optional = true } +prost = "0.13" # keep in line with `datafusion-substrait` +prost-types = "0.13" # keep in line with `datafusion-substrait` uuid = { version = "1.9", features = ["v4"] } mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] } async-trait = "0.1" futures = "0.3" -object_store = { version = "0.10.1", features 
= ["aws", "gcp", "azure"] } +object_store = { version = "0.11.0", features = ["aws", "gcp", "azure"] } parking_lot = "0.12" regex-syntax = "0.8" syn = "2.0.68" url = "2" [build-dependencies] -pyo3-build-config = "0.21" +pyo3-build-config = "0.22" [lib] name = "datafusion_python" @@ -62,4 +62,3 @@ crate-type = ["cdylib", "rlib"] [profile.release] lto = true codegen-units = 1 - diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 903d4a107..35a40ccd4 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -28,7 +28,7 @@ from datafusion._internal import AggregateUDF from datafusion.catalog import Catalog, Table from datafusion.dataframe import DataFrame -from datafusion.expr import Expr +from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list from datafusion.record_batch import RecordBatchStream from datafusion.udf import ScalarUDF @@ -466,7 +466,7 @@ def register_listing_table( table_partition_cols: list[tuple[str, str]] | None = None, file_extension: str = ".parquet", schema: pyarrow.Schema | None = None, - file_sort_order: list[list[Expr]] | None = None, + file_sort_order: list[list[Expr | SortExpr]] | None = None, ) -> None: """Register multiple files as a single table. 
@@ -484,15 +484,18 @@ def register_listing_table( """ if table_partition_cols is None: table_partition_cols = [] - if file_sort_order is not None: - file_sort_order = [[x.expr for x in xs] for xs in file_sort_order] + file_sort_order_raw = ( + [sort_list_to_raw_sort_list(f) for f in file_sort_order] + if file_sort_order is not None + else None + ) self.ctx.register_listing_table( name, str(path), table_partition_cols, file_extension, schema, - file_sort_order, + file_sort_order_raw, ) def sql(self, query: str, options: SQLOptions | None = None) -> DataFrame: diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 56dff22a4..2328ef8fa 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -33,7 +33,7 @@ from typing import Callable from datafusion._internal import DataFrame as DataFrameInternal -from datafusion.expr import Expr +from datafusion.expr import Expr, SortExpr, sort_or_default from datafusion._internal import ( LogicalPlan, ExecutionPlan, @@ -199,7 +199,7 @@ def aggregate( aggs = [e.expr for e in aggs] return DataFrame(self.df.aggregate(group_by, aggs)) - def sort(self, *exprs: Expr) -> DataFrame: + def sort(self, *exprs: Expr | SortExpr) -> DataFrame: """Sort the DataFrame by the specified sorting expressions. Note that any expression can be turned into a sort expression by @@ -211,8 +211,8 @@ def sort(self, *exprs: Expr) -> DataFrame: Returns: DataFrame after sorting. """ - exprs = [expr.expr for expr in exprs] - return DataFrame(self.df.sort(*exprs)) + exprs_raw = [sort_or_default(expr) for expr in exprs] + return DataFrame(self.df.sort(*exprs_raw)) def limit(self, count: int, offset: int = 0) -> DataFrame: """Return a new :py:class:`DataFrame` with a limited number of rows. 
diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index f6a51ce2a..fd5e6f04a 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -22,14 +22,15 @@ from __future__ import annotations -from ._internal import ( - expr as expr_internal, - LogicalPlan, - functions as functions_internal, -) -from datafusion.common import NullTreatment, RexType, DataTypeMap from typing import Any, Optional, Type + import pyarrow as pa +from datafusion.common import DataTypeMap, NullTreatment, RexType +from typing_extensions import deprecated + +from ._internal import LogicalPlan +from ._internal import expr as expr_internal +from ._internal import functions as functions_internal # The following are imported from the internal representation. We may choose to # give these all proper wrappers, or to simply leave as is. These were added @@ -84,7 +85,6 @@ ScalarVariable = expr_internal.ScalarVariable SimilarTo = expr_internal.SimilarTo Sort = expr_internal.Sort -SortExpr = expr_internal.SortExpr Subquery = expr_internal.Subquery SubqueryAlias = expr_internal.SubqueryAlias TableScan = expr_internal.TableScan @@ -159,6 +159,27 @@ ] +def expr_list_to_raw_expr_list( + expr_list: Optional[list[Expr]], +) -> Optional[list[expr_internal.Expr]]: + """Helper function to convert an optional list to raw expressions.""" + return [e.expr for e in expr_list] if expr_list is not None else None + + +def sort_or_default(e: Expr | SortExpr) -> expr_internal.SortExpr: + """Helper function to return a default Sort if an Expr is provided.""" + if isinstance(e, SortExpr): + return e.raw_sort + return SortExpr(e.expr, True, True).raw_sort + + +def sort_list_to_raw_sort_list( + sort_list: Optional[list[Expr | SortExpr]], +) -> Optional[list[expr_internal.SortExpr]]: + """Helper function to return an optional sort list to raw variant.""" + return [sort_or_default(e) for e in sort_list] if sort_list is not None else None + + class Expr: """Expression object. 
@@ -174,12 +195,22 @@ def to_variant(self) -> Any: """Convert this expression into a python object if possible.""" return self.expr.to_variant() + @deprecated( + "display_name() is deprecated. Use :py:meth:`~Expr.schema_name` instead" + ) def display_name(self) -> str: """Returns the name of this expression as it should appear in a schema. This name will not include any CAST expressions. """ - return self.expr.display_name() + return self.schema_name() + + def schema_name(self) -> str: + """Returns the name of this expression as it should appear in a schema. + + This name will not include any CAST expressions. + """ + return self.expr.schema_name() def canonical_name(self) -> str: """Returns a complete string representation of this expression.""" @@ -355,14 +386,14 @@ def alias(self, name: str) -> Expr: """Assign a name to the expression.""" return Expr(self.expr.alias(name)) - def sort(self, ascending: bool = True, nulls_first: bool = True) -> Expr: + def sort(self, ascending: bool = True, nulls_first: bool = True) -> SortExpr: """Creates a sort :py:class:`Expr` from an existing :py:class:`Expr`. Args: ascending: If true, sort in ascending order. nulls_first: Return null values first. """ - return Expr(self.expr.sort(ascending=ascending, nulls_first=nulls_first)) + return SortExpr(self.expr, ascending=ascending, nulls_first=nulls_first) def is_null(self) -> Expr: """Returns ``True`` if this expression is null.""" @@ -455,14 +486,14 @@ def column_name(self, plan: LogicalPlan) -> str: """Compute the output column name based on the provided logical plan.""" return self.expr.column_name(plan) - def order_by(self, *exprs: Expr) -> ExprFuncBuilder: + def order_by(self, *exprs: Expr | SortExpr) -> ExprFuncBuilder: """Set the ordering for a window or aggregate function. This function will create an :py:class:`ExprFuncBuilder` that can be used to set parameters for either window or aggregate functions. 
If used on any other type of expression, an error will be generated when ``build()`` is called. """ - return ExprFuncBuilder(self.expr.order_by(list(e.expr for e in exprs))) + return ExprFuncBuilder(self.expr.order_by([sort_or_default(e) for e in exprs])) def filter(self, filter: Expr) -> ExprFuncBuilder: """Filter an aggregate function. @@ -522,7 +553,9 @@ def order_by(self, *exprs: Expr) -> ExprFuncBuilder: Values given in ``exprs`` must be sort expressions. You can convert any other expression to a sort expression using `.sort()`. """ - return ExprFuncBuilder(self.builder.order_by(list(e.expr for e in exprs))) + return ExprFuncBuilder( + self.builder.order_by([sort_or_default(e) for e in exprs]) + ) def filter(self, filter: Expr) -> ExprFuncBuilder: """Filter values during aggregation.""" @@ -659,3 +692,27 @@ def end(self) -> Expr: Any non-matching cases will end in a `null` value. """ return Expr(self.case_builder.end()) + + +class SortExpr: + """Used to specify sorting on either a DataFrame or function.""" + + def __init__(self, expr: Expr, ascending: bool, nulls_first: bool) -> None: + """This constructor should not be called by the end user.""" + self.raw_sort = expr_internal.SortExpr(expr, ascending, nulls_first) + + def expr(self) -> Expr: + """Return the raw expr backing the SortExpr.""" + return Expr(self.raw_sort.expr()) + + def ascending(self) -> bool: + """Return ascending property.""" + return self.raw_sort.ascending() + + def nulls_first(self) -> bool: + """Return nulls_first property.""" + return self.raw_sort.nulls_first() + + def __repr__(self) -> str: + """Generate a string representation of this expression.""" + return self.raw_sort.__repr__() diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 163ff04e4..0401afbc4 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -18,8 +18,15 @@ from __future__ import annotations -from datafusion._internal import functions as f, expr as 
expr_internal -from datafusion.expr import CaseBuilder, Expr, WindowFrame +from datafusion._internal import functions as f +from datafusion.expr import ( + CaseBuilder, + Expr, + WindowFrame, + SortExpr, + sort_list_to_raw_sort_list, + expr_list_to_raw_expr_list, +) from datafusion.context import SessionContext from datafusion.common import NullTreatment @@ -261,12 +268,6 @@ ] -def expr_list_to_raw_expr_list( - expr_list: Optional[list[Expr]], -) -> Optional[list[expr_internal.Expr]]: - return [e.expr for e in expr_list] if expr_list is not None else None - - def isnan(expr: Expr) -> Expr: """Returns true if a given number is +NaN or -NaN otherwise returns false.""" return Expr(f.isnan(expr.expr)) @@ -352,9 +353,9 @@ def concat_ws(separator: str, *args: Expr) -> Expr: return Expr(f.concat_ws(separator, args)) -def order_by(expr: Expr, ascending: bool = True, nulls_first: bool = True) -> Expr: +def order_by(expr: Expr, ascending: bool = True, nulls_first: bool = True) -> SortExpr: """Creates a new sort expression.""" - return Expr(f.order_by(expr.expr, ascending, nulls_first)) + return SortExpr(expr.expr, ascending=ascending, nulls_first=nulls_first) def alias(expr: Expr, name: str) -> Expr: @@ -405,7 +406,7 @@ def window( name: str, args: list[Expr], partition_by: list[Expr] | None = None, - order_by: list[Expr] | None = None, + order_by: list[Expr | SortExpr] | None = None, window_frame: WindowFrame | None = None, ctx: SessionContext | None = None, ) -> Expr: @@ -419,9 +420,9 @@ def window( """ args = [a.expr for a in args] partition_by = expr_list_to_raw_expr_list(partition_by) - order_by = expr_list_to_raw_expr_list(order_by) + order_by_raw = sort_list_to_raw_sort_list(order_by) window_frame = window_frame.window_frame if window_frame is not None else None - return Expr(f.window(name, args, partition_by, order_by, window_frame, ctx)) + return Expr(f.window(name, args, partition_by, order_by_raw, window_frame, ctx)) # scalar functions @@ -1608,7 +1609,7 @@ def 
array_agg( expression: Expr, distinct: bool = False, filter: Optional[Expr] = None, - order_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, ) -> Expr: """Aggregate values into an array. @@ -1625,7 +1626,7 @@ def array_agg( filter: If provided, only compute against rows for which the filter is True order_by: Order the resultant array values """ - order_by_raw = expr_list_to_raw_expr_list(order_by) + order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None return Expr( @@ -2107,7 +2108,7 @@ def regr_syy( def first_value( expression: Expr, filter: Optional[Expr] = None, - order_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, null_treatment: NullTreatment = NullTreatment.RESPECT_NULLS, ) -> Expr: """Returns the first value in a group of values. @@ -2123,7 +2124,7 @@ def first_value( order_by: Set the ordering of the expression to evaluate null_treatment: Assign whether to respect or ignull null values. """ - order_by_raw = expr_list_to_raw_expr_list(order_by) + order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None return Expr( @@ -2139,7 +2140,7 @@ def first_value( def last_value( expression: Expr, filter: Optional[Expr] = None, - order_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, null_treatment: NullTreatment = NullTreatment.RESPECT_NULLS, ) -> Expr: """Returns the last value in a group of values. @@ -2155,7 +2156,7 @@ def last_value( order_by: Set the ordering of the expression to evaluate null_treatment: Assign whether to respect or ignull null values. 
""" - order_by_raw = expr_list_to_raw_expr_list(order_by) + order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None return Expr( @@ -2172,7 +2173,7 @@ def nth_value( expression: Expr, n: int, filter: Optional[Expr] = None, - order_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, null_treatment: NullTreatment = NullTreatment.RESPECT_NULLS, ) -> Expr: """Returns the n-th value in a group of values. @@ -2189,7 +2190,7 @@ def nth_value( order_by: Set the ordering of the expression to evaluate null_treatment: Assign whether to respect or ignull null values. """ - order_by_raw = expr_list_to_raw_expr_list(order_by) + order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None return Expr( @@ -2293,7 +2294,7 @@ def lead( shift_offset: int = 1, default_value: Optional[Any] = None, partition_by: Optional[list[Expr]] = None, - order_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, ) -> Expr: """Create a lead window function. @@ -2330,7 +2331,7 @@ def lead( partition_cols = ( [col.expr for col in partition_by] if partition_by is not None else None ) - order_cols = [col.expr for col in order_by] if order_by is not None else None + order_by_raw = sort_list_to_raw_sort_list(order_by) return Expr( f.lead( @@ -2338,7 +2339,7 @@ def lead( shift_offset, default_value, partition_by=partition_cols, - order_by=order_cols, + order_by=order_by_raw, ) ) @@ -2348,7 +2349,7 @@ def lag( shift_offset: int = 1, default_value: Optional[Any] = None, partition_by: Optional[list[Expr]] = None, - order_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, ) -> Expr: """Create a lag window function. 
@@ -2382,7 +2383,7 @@ def lag( partition_cols = ( [col.expr for col in partition_by] if partition_by is not None else None ) - order_cols = [col.expr for col in order_by] if order_by is not None else None + order_by_raw = sort_list_to_raw_sort_list(order_by) return Expr( f.lag( @@ -2390,14 +2391,14 @@ def lag( shift_offset, default_value, partition_by=partition_cols, - order_by=order_cols, + order_by=order_by_raw, ) ) def row_number( partition_by: Optional[list[Expr]] = None, - order_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, ) -> Expr: """Create a row number window function. @@ -2421,19 +2422,19 @@ def row_number( partition_cols = ( [col.expr for col in partition_by] if partition_by is not None else None ) - order_cols = [col.expr for col in order_by] if order_by is not None else None + order_by_raw = sort_list_to_raw_sort_list(order_by) return Expr( f.row_number( partition_by=partition_cols, - order_by=order_cols, + order_by=order_by_raw, ) ) def rank( partition_by: Optional[list[Expr]] = None, - order_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, ) -> Expr: """Create a rank window function. @@ -2462,19 +2463,19 @@ def rank( partition_cols = ( [col.expr for col in partition_by] if partition_by is not None else None ) - order_cols = [col.expr for col in order_by] if order_by is not None else None + order_by_raw = sort_list_to_raw_sort_list(order_by) return Expr( f.rank( partition_by=partition_cols, - order_by=order_cols, + order_by=order_by_raw, ) ) def dense_rank( partition_by: Optional[list[Expr]] = None, - order_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, ) -> Expr: """Create a dense_rank window function. 
@@ -2498,19 +2499,19 @@ def dense_rank( partition_cols = ( [col.expr for col in partition_by] if partition_by is not None else None ) - order_cols = [col.expr for col in order_by] if order_by is not None else None + order_by_raw = sort_list_to_raw_sort_list(order_by) return Expr( f.dense_rank( partition_by=partition_cols, - order_by=order_cols, + order_by=order_by_raw, ) ) def percent_rank( partition_by: Optional[list[Expr]] = None, - order_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, ) -> Expr: """Create a percent_rank window function. @@ -2535,19 +2536,19 @@ def percent_rank( partition_cols = ( [col.expr for col in partition_by] if partition_by is not None else None ) - order_cols = [col.expr for col in order_by] if order_by is not None else None + order_by_raw = sort_list_to_raw_sort_list(order_by) return Expr( f.percent_rank( partition_by=partition_cols, - order_by=order_cols, + order_by=order_by_raw, ) ) def cume_dist( partition_by: Optional[list[Expr]] = None, - order_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, ) -> Expr: """Create a cumulative distribution window function. @@ -2572,12 +2573,12 @@ def cume_dist( partition_cols = ( [col.expr for col in partition_by] if partition_by is not None else None ) - order_cols = [col.expr for col in order_by] if order_by is not None else None + order_by_raw = sort_list_to_raw_sort_list(order_by) return Expr( f.cume_dist( partition_by=partition_cols, - order_by=order_cols, + order_by=order_by_raw, ) ) @@ -2585,7 +2586,7 @@ def cume_dist( def ntile( groups: int, partition_by: Optional[list[Expr]] = None, - order_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, ) -> Expr: """Create a n-tile window function. 
@@ -2613,13 +2614,13 @@ def ntile( partition_cols = ( [col.expr for col in partition_by] if partition_by is not None else None ) - order_cols = [col.expr for col in order_by] if order_by is not None else None + order_by_raw = sort_list_to_raw_sort_list(order_by) return Expr( f.ntile( Expr.literal(groups).expr, partition_by=partition_cols, - order_by=order_cols, + order_by=order_by_raw, ) ) @@ -2628,7 +2629,7 @@ def string_agg( expression: Expr, delimiter: str, filter: Optional[Expr] = None, - order_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, ) -> Expr: """Concatenates the input strings. @@ -2645,7 +2646,7 @@ def string_agg( filter: If provided, only compute against rows for which the filter is True order_by: Set the ordering of the expression to evaluate """ - order_by_raw = expr_list_to_raw_expr_list(order_by) + order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None return Expr( diff --git a/python/datafusion/tests/test_expr.py b/python/datafusion/tests/test_expr.py index 056d2ea03..b58177f16 100644 --- a/python/datafusion/tests/test_expr.py +++ b/python/datafusion/tests/test_expr.py @@ -15,18 +15,21 @@ # specific language governing permissions and limitations # under the License. 
+import pyarrow +import pytest from datafusion import SessionContext, col -from datafusion.expr import Column, Literal, BinaryExpr, AggregateFunction from datafusion.expr import ( - Projection, - Filter, Aggregate, + AggregateFunction, + BinaryExpr, + Column, + Filter, Limit, + Literal, + Projection, Sort, TableScan, ) -import pyarrow -import pytest @pytest.fixture @@ -192,3 +195,24 @@ def test_expr_getitem() -> None: assert names == ["Alice", "Bob", "Charlie", None] assert array_values == [2, 5, None, None] + + +def test_display_name_deprecation(): + import warnings + + expr = col("foo") + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered + warnings.simplefilter("always") + + # should trigger warning + name = expr.display_name() + + # Verify some things + assert len(w) == 1 + assert issubclass(w[-1].category, DeprecationWarning) + assert "deprecated" in str(w[-1].message) + + # returns appropriate result + assert name == expr.schema_name() + assert name == "foo" diff --git a/python/datafusion/tests/test_sql.py b/python/datafusion/tests/test_sql.py index e41d01004..cbb2e9f57 100644 --- a/python/datafusion/tests/test_sql.py +++ b/python/datafusion/tests/test_sql.py @@ -264,14 +264,17 @@ def test_execute(ctx, tmp_path): # count result = ctx.sql("SELECT COUNT(a) AS cnt FROM t WHERE a IS NOT NULL").collect() + ctx.sql("SELECT COUNT(a) AS cnt FROM t WHERE a IS NOT NULL").show() + + expected_schema = pa.schema([("cnt", pa.int64(), False)]) + expected_values = pa.array([7], type=pa.int64()) + expected = [pa.RecordBatch.from_arrays([expected_values], schema=expected_schema)] - expected = pa.array([7], pa.int64()) - expected = [pa.RecordBatch.from_arrays([expected], ["cnt"])] assert result == expected # where - expected = pa.array([2], pa.int64()) - expected = [pa.RecordBatch.from_arrays([expected], ["cnt"])] + expected_values = pa.array([2], type=pa.int64()) + expected = [pa.RecordBatch.from_arrays([expected_values], 
schema=expected_schema)] result = ctx.sql("SELECT COUNT(a) AS cnt FROM t WHERE a > 10").collect() assert result == expected diff --git a/python/datafusion/tests/test_wrapper_coverage.py b/python/datafusion/tests/test_wrapper_coverage.py index 4a47de2e1..c53a89c59 100644 --- a/python/datafusion/tests/test_wrapper_coverage.py +++ b/python/datafusion/tests/test_wrapper_coverage.py @@ -39,7 +39,10 @@ def missing_exports(internal_obj, wrapped_obj) -> None: internal_attr = getattr(internal_obj, attr) wrapped_attr = getattr(wrapped_obj, attr) - assert wrapped_attr is not None if internal_attr is not None else True + if internal_attr is not None: + if wrapped_attr is None: + print("Missing attribute: ", attr) + assert False if attr in ["__self__", "__class__"]: continue diff --git a/src/common/data_type.rs b/src/common/data_type.rs index a29d1799c..7f9c75bfd 100644 --- a/src/common/data_type.rs +++ b/src/common/data_type.rs @@ -24,7 +24,7 @@ use pyo3::{exceptions::PyValueError, prelude::*}; use crate::errors::py_datafusion_err; #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "RexType", module = "datafusion.common")] +#[pyclass(eq, eq_int, name = "RexType", module = "datafusion.common")] pub enum RexType { Alias, Literal, @@ -692,7 +692,7 @@ impl From for PyDataType { /// Represents the possible Python types that can be mapped to the SQL types #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "PythonType", module = "datafusion.common")] +#[pyclass(eq, eq_int, name = "PythonType", module = "datafusion.common")] pub enum PythonType { Array, Bool, @@ -712,7 +712,7 @@ pub enum PythonType { #[allow(non_camel_case_types)] #[allow(clippy::upper_case_acronyms)] #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "SqlType", module = "datafusion.common")] +#[pyclass(eq, eq_int, name = "SqlType", module = "datafusion.common")] pub enum SqlType { ANY, ARRAY, @@ -770,7 +770,7 @@ pub enum SqlType { 
#[allow(non_camel_case_types)] #[allow(clippy::upper_case_acronyms)] #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "NullTreatment", module = "datafusion.common")] +#[pyclass(eq, eq_int, name = "NullTreatment", module = "datafusion.common")] pub enum NullTreatment { IGNORE_NULLS, RESPECT_NULLS, diff --git a/src/common/schema.rs b/src/common/schema.rs index 5806c90e2..66ce925ae 100644 --- a/src/common/schema.rs +++ b/src/common/schema.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; +use std::{any::Any, borrow::Cow}; use datafusion::arrow::datatypes::SchemaRef; use datafusion::logical_expr::{Expr, TableProviderFilterPushDown, TableSource}; @@ -62,6 +62,7 @@ pub struct SqlTable { #[pymethods] impl SqlTable { #[new] + #[pyo3(signature = (table_name, columns, row_count, filepaths=None))] pub fn new( table_name: String, columns: Vec<(String, DataTypeMap)>, @@ -163,39 +164,33 @@ impl TableSource for SqlTableSource { self.schema.clone() } - fn supports_filter_pushdown( - &self, - filter: &Expr, - ) -> datafusion::common::Result { - let filters = split_conjunction(filter); - if filters.iter().all(|f| is_supported_push_down_expr(f)) { - // Push down filters to the tablescan operation if all are supported - Ok(TableProviderFilterPushDown::Exact) - } else if filters.iter().any(|f| is_supported_push_down_expr(f)) { - // Partially apply the filter in the TableScan but retain - // the Filter operator in the plan as well - Ok(TableProviderFilterPushDown::Inexact) - } else { - Ok(TableProviderFilterPushDown::Unsupported) - } - } - fn table_type(&self) -> datafusion::logical_expr::TableType { datafusion::logical_expr::TableType::Base } - #[allow(deprecated)] fn supports_filters_pushdown( &self, filters: &[&Expr], ) -> datafusion::common::Result> { filters .iter() - .map(|f| self.supports_filter_pushdown(f)) + .map(|f| { + let filters = split_conjunction(f); + if 
filters.iter().all(|f| is_supported_push_down_expr(f)) { + // Push down filters to the tablescan operation if all are supported + Ok(TableProviderFilterPushDown::Exact) + } else if filters.iter().any(|f| is_supported_push_down_expr(f)) { + // Partially apply the filter in the TableScan but retain + // the Filter operator in the plan as well + Ok(TableProviderFilterPushDown::Inexact) + } else { + Ok(TableProviderFilterPushDown::Unsupported) + } + }) .collect() } - fn get_logical_plan(&self) -> Option<&datafusion::logical_expr::LogicalPlan> { + fn get_logical_plan(&self) -> Option> { None } } diff --git a/src/context.rs b/src/context.rs index 11b9fed5f..79db2e65c 100644 --- a/src/context.rs +++ b/src/context.rs @@ -35,7 +35,7 @@ use crate::catalog::{PyCatalog, PyTable}; use crate::dataframe::PyDataFrame; use crate::dataset::Dataset; use crate::errors::{py_datafusion_err, DataFusionError}; -use crate::expr::PyExpr; +use crate::expr::sort_expr::PySortExpr; use crate::physical_plan::PyExecutionPlan; use crate::record_batch::PyRecordBatchStream; use crate::sql::logical::PyLogicalPlan; @@ -86,7 +86,7 @@ impl PySessionConfig { let mut config = SessionConfig::new(); if let Some(hash_map) = config_options { for (k, v) in &hash_map { - config = config.set(k, ScalarValue::Utf8(Some(v.clone()))); + config = config.set(k, &ScalarValue::Utf8(Some(v.clone()))); } } @@ -294,6 +294,7 @@ impl PySessionContext { } /// Register an object store with the given name + #[pyo3(signature = (scheme, store, host=None))] pub fn register_object_store( &mut self, scheme: &str, @@ -332,7 +333,7 @@ impl PySessionContext { table_partition_cols: Vec<(String, String)>, file_extension: &str, schema: Option>, - file_sort_order: Option>>, + file_sort_order: Option>>, py: Python, ) -> PyResult<()> { let options = ListingOptions::new(Arc::new(ParquetFormat::new())) @@ -374,6 +375,7 @@ impl PySessionContext { Ok(PyDataFrame::new(df)) } + #[pyo3(signature = (query, options=None))] pub fn sql_with_options( 
&mut self, query: &str, @@ -390,6 +392,7 @@ impl PySessionContext { Ok(PyDataFrame::new(df)) } + #[pyo3(signature = (partitions, name=None, schema=None))] pub fn create_dataframe( &mut self, partitions: PyArrowType>>, @@ -433,6 +436,7 @@ impl PySessionContext { } /// Construct datafusion dataframe from Python list + #[pyo3(signature = (data, name=None))] pub fn from_pylist( &mut self, data: Bound<'_, PyList>, @@ -452,6 +456,7 @@ impl PySessionContext { } /// Construct datafusion dataframe from Python dictionary + #[pyo3(signature = (data, name=None))] pub fn from_pydict( &mut self, data: Bound<'_, PyDict>, @@ -471,6 +476,7 @@ impl PySessionContext { } /// Construct datafusion dataframe from Arrow Table + #[pyo3(signature = (data, name=None))] pub fn from_arrow( &mut self, data: Bound<'_, PyAny>, @@ -506,6 +512,7 @@ impl PySessionContext { /// Construct datafusion dataframe from pandas #[allow(clippy::wrong_self_convention)] + #[pyo3(signature = (data, name=None))] pub fn from_pandas( &mut self, data: Bound<'_, PyAny>, @@ -525,6 +532,7 @@ impl PySessionContext { } /// Construct datafusion dataframe from polars + #[pyo3(signature = (data, name=None))] pub fn from_polars( &mut self, data: Bound<'_, PyAny>, @@ -581,7 +589,7 @@ impl PySessionContext { file_extension: &str, skip_metadata: bool, schema: Option>, - file_sort_order: Option>>, + file_sort_order: Option>>, py: Python, ) -> PyResult<()> { let mut options = ParquetReadOptions::default() @@ -882,7 +890,7 @@ impl PySessionContext { file_extension: &str, skip_metadata: bool, schema: Option>, - file_sort_order: Option>>, + file_sort_order: Option>>, py: Python, ) -> PyResult { let mut options = ParquetReadOptions::default() diff --git a/src/dataframe.rs b/src/dataframe.rs index f33622cc0..1f7f2e643 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -36,16 +36,19 @@ use datafusion::prelude::*; use pyo3::exceptions::{PyTypeError, PyValueError}; use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; -use 
pyo3::types::{PyCapsule, PyTuple}; +use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods}; use tokio::task::JoinHandle; use crate::errors::py_datafusion_err; -use crate::expr::to_sort_expressions; +use crate::expr::sort_expr::to_sort_expressions; use crate::physical_plan::PyExecutionPlan; use crate::record_batch::PyRecordBatchStream; use crate::sql::logical::PyLogicalPlan; use crate::utils::{get_tokio_runtime, wait_for_future}; -use crate::{errors::DataFusionError, expr::PyExpr}; +use crate::{ + errors::DataFusionError, + expr::{sort_expr::PySortExpr, PyExpr}, +}; /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. @@ -70,7 +73,7 @@ impl PyDataFrame { if let Ok(key) = key.extract::() { // df[col] self.select_columns(vec![key]) - } else if let Ok(tuple) = key.extract::<&PyTuple>() { + } else if let Ok(tuple) = key.downcast::() { // df[col1, col2, col3] let keys = tuple .iter() @@ -196,7 +199,7 @@ impl PyDataFrame { } #[pyo3(signature = (*exprs))] - fn sort(&self, exprs: Vec) -> PyResult { + fn sort(&self, exprs: Vec) -> PyResult { let exprs = to_sort_expressions(exprs); let df = self.df.as_ref().clone().sort(exprs)?; Ok(Self::new(df)) @@ -504,6 +507,7 @@ impl PyDataFrame { Ok(table) } + #[pyo3(signature = (requested_schema=None))] fn __arrow_c_stream__<'py>( &'py mut self, py: Python<'py>, diff --git a/src/dataset.rs b/src/dataset.rs index de7402fd6..a8fa21ec5 100644 --- a/src/dataset.rs +++ b/src/dataset.rs @@ -39,7 +39,7 @@ use crate::dataset_exec::DatasetExec; use crate::pyarrow_filter_expression::PyArrowFilterExpression; // Wraps a pyarrow.dataset.Dataset class and implements a Datafusion TableProvider around it -#[derive(Debug, Clone)] +#[derive(Debug)] pub(crate) struct Dataset { dataset: PyObject, } diff --git a/src/dataset_exec.rs b/src/dataset_exec.rs index a377e2555..2759aa678 100644 --- a/src/dataset_exec.rs +++ 
b/src/dataset_exec.rs @@ -53,7 +53,7 @@ impl Iterator for PyArrowBatchesAdapter { fn next(&mut self) -> Option { Python::with_gil(|py| { - let mut batches = self.batches.clone().into_bound(py); + let mut batches = self.batches.clone_ref(py).into_bound(py); Some( batches .next()? @@ -65,7 +65,7 @@ impl Iterator for PyArrowBatchesAdapter { } // Wraps a pyarrow.dataset.Dataset class and implements a Datafusion ExecutionPlan around it -#[derive(Debug, Clone)] +#[derive(Debug)] pub(crate) struct DatasetExec { dataset: PyObject, schema: SchemaRef, diff --git a/src/expr.rs b/src/expr.rs index 0e1a193f3..304d147c9 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -28,7 +28,7 @@ use datafusion::arrow::pyarrow::PyArrowType; use datafusion::functions::core::expr_ext::FieldAccessor; use datafusion::logical_expr::{ col, - expr::{AggregateFunction, InList, InSubquery, ScalarFunction, Sort, WindowFunction}, + expr::{AggregateFunction, InList, InSubquery, ScalarFunction, WindowFunction}, lit, Between, BinaryExpr, Case, Cast, Expr, Like, Operator, TryCast, }; use datafusion::scalar::ScalarValue; @@ -94,6 +94,8 @@ pub mod unnest; pub mod unnest_expr; pub mod window; +use sort_expr::{to_sort_expressions, PySortExpr}; + /// A PyExpr that can be used on a DataFrame #[pyclass(name = "Expr", module = "datafusion.expr", subclass)] #[derive(Debug, Clone)] @@ -150,7 +152,6 @@ impl PyExpr { Expr::Case(value) => Ok(case::PyCase::from(value.clone()).into_py(py)), Expr::Cast(value) => Ok(cast::PyCast::from(value.clone()).into_py(py)), Expr::TryCast(value) => Ok(cast::PyTryCast::from(value.clone()).into_py(py)), - Expr::Sort(value) => Ok(sort_expr::PySortExpr::from(value.clone()).into_py(py)), Expr::ScalarFunction(value) => Err(py_unsupported_variant_err(format!( "Converting Expr::ScalarFunction to a Python object is not implemented: {:?}", value @@ -167,9 +168,9 @@ impl PyExpr { Expr::ScalarSubquery(value) => { Ok(scalar_subquery::PyScalarSubquery::from(value.clone()).into_py(py)) } - 
Expr::Wildcard { qualifier } => Err(py_unsupported_variant_err(format!( - "Converting Expr::Wildcard to a Python object is not implemented : {:?}", - qualifier + Expr::Wildcard { qualifier, options } => Err(py_unsupported_variant_err(format!( + "Converting Expr::Wildcard to a Python object is not implemented : {:?} {:?}", + qualifier, options ))), Expr::GroupingSet(value) => { Ok(grouping_set::PyGroupingSet::from(value.clone()).into_py(py)) @@ -188,13 +189,13 @@ impl PyExpr { /// Returns the name of this expression as it should appear in a schema. This name /// will not include any CAST expressions. - fn display_name(&self) -> PyResult { - Ok(self.expr.display_name()?) + fn schema_name(&self) -> PyResult { + Ok(format!("{}", self.expr.schema_name())) } /// Returns a full and complete string representation of this expression. fn canonical_name(&self) -> PyResult { - Ok(self.expr.canonical_name()) + Ok(format!("{}", self.expr)) } /// Returns the name of the Expr variant. @@ -274,7 +275,7 @@ impl PyExpr { /// Create a sort PyExpr from an existing PyExpr. #[pyo3(signature = (ascending=true, nulls_first=true))] - pub fn sort(&self, ascending: bool, nulls_first: bool) -> PyExpr { + pub fn sort(&self, ascending: bool, nulls_first: bool) -> PySortExpr { self.expr.clone().sort(ascending, nulls_first).into() } @@ -323,7 +324,6 @@ impl PyExpr { | Expr::Case { .. } | Expr::Cast { .. } | Expr::TryCast { .. } - | Expr::Sort { .. } | Expr::ScalarFunction { .. } | Expr::AggregateFunction { .. } | Expr::WindowFunction { .. } @@ -387,7 +387,6 @@ impl PyExpr { | Expr::Negative(expr) | Expr::Cast(Cast { expr, .. }) | Expr::TryCast(TryCast { expr, .. }) - | Expr::Sort(Sort { expr, .. }) | Expr::InSubquery(InSubquery { expr, .. 
}) => Ok(vec![PyExpr::from(*expr.clone())]), // Expr variants containing a collection of Expr(s) for operands @@ -529,7 +528,7 @@ impl PyExpr { // Expression Function Builder functions - pub fn order_by(&self, order_by: Vec) -> PyExprFuncBuilder { + pub fn order_by(&self, order_by: Vec) -> PyExprFuncBuilder { self.expr .clone() .order_by(to_sort_expressions(order_by)) @@ -573,20 +572,9 @@ impl From for PyExprFuncBuilder { } } -pub fn to_sort_expressions(order_by: Vec) -> Vec { - order_by - .iter() - .map(|e| e.expr.clone()) - .map(|e| match e { - Expr::Sort(_) => e, - _ => e.sort(true, true), - }) - .collect() -} - #[pymethods] impl PyExprFuncBuilder { - pub fn order_by(&self, order_by: Vec) -> PyExprFuncBuilder { + pub fn order_by(&self, order_by: Vec) -> PyExprFuncBuilder { self.builder .clone() .order_by(to_sort_expressions(order_by)) @@ -641,11 +629,6 @@ impl PyExpr { input_plan: &LogicalPlan, ) -> Result, DataFusionError> { match expr { - Expr::Sort(Sort { expr, .. }) => { - // DataFusion does not support create_name for sort expressions (since they never - // appear in projections) so we just delegate to the contained expression instead - Self::expr_to_field(expr, input_plan) - } Expr::Wildcard { .. 
} => { // Since * could be any of the valid column names just return the first one Ok(Arc::new(input_plan.schema().field(0).clone())) diff --git a/src/expr/sort.rs b/src/expr/sort.rs index b31ebfe0b..a1803ccaf 100644 --- a/src/expr/sort.rs +++ b/src/expr/sort.rs @@ -22,7 +22,7 @@ use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; use crate::expr::logical_node::LogicalNode; -use crate::expr::PyExpr; +use crate::expr::sort_expr::PySortExpr; use crate::sql::logical::PyLogicalPlan; #[pyclass(name = "Sort", module = "datafusion.expr", subclass)] @@ -63,12 +63,12 @@ impl Display for PySort { #[pymethods] impl PySort { /// Retrieves the sort expressions for this `Sort` - fn sort_exprs(&self) -> PyResult> { + fn sort_exprs(&self) -> PyResult> { Ok(self .sort .expr .iter() - .map(|e| PyExpr::from(e.clone())) + .map(|e| PySortExpr::from(e.clone())) .collect()) } diff --git a/src/expr/sort_expr.rs b/src/expr/sort_expr.rs index 4299d1f71..12f74e4d8 100644 --- a/src/expr/sort_expr.rs +++ b/src/expr/sort_expr.rs @@ -51,10 +51,29 @@ impl Display for PySortExpr { } } +pub fn to_sort_expressions(order_by: Vec) -> Vec { + order_by.iter().map(|e| e.sort.clone()).collect() +} + +pub fn py_sort_expr_list(expr: &[SortExpr]) -> PyResult> { + Ok(expr.iter().map(|e| PySortExpr::from(e.clone())).collect()) +} + #[pymethods] impl PySortExpr { + #[new] + fn new(expr: PyExpr, asc: bool, nulls_first: bool) -> Self { + Self { + sort: SortExpr { + expr: expr.into(), + asc, + nulls_first, + }, + } + } + fn expr(&self) -> PyResult { - Ok((*self.sort.expr).clone().into()) + Ok(self.sort.expr.clone().into()) } fn ascending(&self) -> PyResult { diff --git a/src/expr/window.rs b/src/expr/window.rs index f17a6dd9b..950db12ae 100644 --- a/src/expr/window.rs +++ b/src/expr/window.rs @@ -24,6 +24,7 @@ use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; use crate::errors::py_type_err; use crate::expr::logical_node::LogicalNode; +use 
crate::expr::sort_expr::{py_sort_expr_list, PySortExpr}; use crate::expr::PyExpr; use crate::sql::logical::PyLogicalPlan; @@ -114,9 +115,9 @@ impl PyWindow { } /// Returns order by columns in a window function expression - pub fn get_sort_exprs(&self, expr: PyExpr) -> PyResult> { + pub fn get_sort_exprs(&self, expr: PyExpr) -> PyResult> { match expr.expr.unalias() { - Expr::WindowFunction(WindowFunction { order_by, .. }) => py_expr_list(&order_by), + Expr::WindowFunction(WindowFunction { order_by, .. }) => py_sort_expr_list(&order_by), other => Err(not_window_function_err(other)), } } diff --git a/src/functions.rs b/src/functions.rs index b9ca6301a..32f6519f7 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -25,7 +25,8 @@ use crate::common::data_type::NullTreatment; use crate::context::PySessionContext; use crate::errors::DataFusionError; use crate::expr::conditional_expr::PyCaseBuilder; -use crate::expr::to_sort_expressions; +use crate::expr::sort_expr::to_sort_expressions; +use crate::expr::sort_expr::PySortExpr; use crate::expr::window::PyWindowFrame; use crate::expr::PyExpr; use datafusion::common::{Column, ScalarValue, TableReference}; @@ -35,7 +36,7 @@ use datafusion::functions_aggregate; use datafusion::logical_expr::expr::Alias; use datafusion::logical_expr::sqlparser::ast::NullTreatment as DFNullTreatment; use datafusion::logical_expr::{ - expr::{find_df_window_func, Sort, WindowFunction}, + expr::{find_df_window_func, WindowFunction}, lit, Expr, WindowFunctionDefinition, }; @@ -43,7 +44,7 @@ fn add_builder_fns_to_aggregate( agg_fn: Expr, distinct: Option, filter: Option, - order_by: Option>, + order_by: Option>, null_treatment: Option, ) -> PyResult { // Since ExprFuncBuilder::new() is private, we can guarantee initializing @@ -96,6 +97,7 @@ fn array_cat(exprs: Vec) -> PyExpr { } #[pyfunction] +#[pyo3(signature = (array, element, index=None))] fn array_position(array: PyExpr, element: PyExpr, index: Option) -> PyExpr { let index = 
ScalarValue::Int64(index); let index = Expr::Literal(index); @@ -104,6 +106,7 @@ fn array_position(array: PyExpr, element: PyExpr, index: Option) -> PyExpr } #[pyfunction] +#[pyo3(signature = (array, begin, end, stride=None))] fn array_slice(array: PyExpr, begin: PyExpr, end: PyExpr, stride: Option) -> PyExpr { datafusion::functions_nested::expr_fn::array_slice( array.into(), @@ -142,16 +145,19 @@ fn concat_ws(sep: String, args: Vec) -> PyResult { } #[pyfunction] +#[pyo3(signature = (values, regex, flags=None))] fn regexp_like(values: PyExpr, regex: PyExpr, flags: Option) -> PyResult { Ok(functions::expr_fn::regexp_like(values.expr, regex.expr, flags.map(|x| x.expr)).into()) } #[pyfunction] +#[pyo3(signature = (values, regex, flags=None))] fn regexp_match(values: PyExpr, regex: PyExpr, flags: Option) -> PyResult { Ok(functions::expr_fn::regexp_match(values.expr, regex.expr, flags.map(|x| x.expr)).into()) } #[pyfunction] +#[pyo3(signature = (string, pattern, replacement, flags=None))] /// Replaces substring(s) matching a POSIX regular expression. 
fn regexp_replace( string: PyExpr, @@ -169,14 +175,12 @@ fn regexp_replace( } /// Creates a new Sort Expr #[pyfunction] -fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) -> PyResult { - Ok(PyExpr { - expr: datafusion::logical_expr::Expr::Sort(Sort { - expr: Box::new(expr.expr), - asc, - nulls_first, - }), - }) +fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) -> PyResult { + Ok(PySortExpr::from(datafusion::logical_expr::expr::Sort { + expr: expr.expr, + asc, + nulls_first, + })) } /// Creates a new Alias Expr @@ -283,11 +287,12 @@ fn find_window_fn(name: &str, ctx: Option) -> PyResult, partition_by: Option>, - order_by: Option>, + order_by: Option>, window_frame: Option, ctx: Option, ) -> PyResult { @@ -309,11 +314,7 @@ fn window( order_by: order_by .unwrap_or_default() .into_iter() - .map(|x| x.expr) - .map(|e| match e { - Expr::Sort(_) => e, - _ => e.sort(true, true), - }) + .map(|x| x.into()) .collect::>(), window_frame, null_treatment: None, @@ -331,11 +332,12 @@ macro_rules! aggregate_function { }; ($NAME: ident, $($arg:ident)*) => { #[pyfunction] + #[pyo3(signature = ($($arg),*, distinct=None, filter=None, order_by=None, null_treatment=None))] fn $NAME( $($arg: PyExpr),*, distinct: Option, filter: Option, - order_by: Option>, + order_by: Option>, null_treatment: Option ) -> PyResult { let agg_fn = functions_aggregate::expr_fn::$NAME($($arg.into()),*); @@ -351,11 +353,12 @@ macro_rules! 
aggregate_function_vec_args { }; ($NAME: ident, $($arg:ident)*) => { #[pyfunction] + #[pyo3(signature = ($($arg),*, distinct=None, filter=None, order_by=None, null_treatment=None))] fn $NAME( $($arg: PyExpr),*, distinct: Option, filter: Option, - order_by: Option>, + order_by: Option>, null_treatment: Option ) -> PyResult { let agg_fn = functions_aggregate::expr_fn::$NAME(vec![$($arg.into()),*]); @@ -624,6 +627,7 @@ aggregate_function!(approx_median); // aggregate_function!(grouping); #[pyfunction] +#[pyo3(signature = (expression, percentile, num_centroids=None, filter=None))] pub fn approx_percentile_cont( expression: PyExpr, percentile: f64, @@ -642,6 +646,7 @@ pub fn approx_percentile_cont( } #[pyfunction] +#[pyo3(signature = (expression, weight, percentile, filter=None))] pub fn approx_percentile_cont_with_weight( expression: PyExpr, weight: PyExpr, @@ -662,11 +667,12 @@ aggregate_function_vec_args!(last_value); // We handle first_value explicitly because the signature expects an order_by // https://github.com/apache/datafusion/issues/12376 #[pyfunction] +#[pyo3(signature = (expr, distinct=None, filter=None, order_by=None, null_treatment=None))] pub fn first_value( expr: PyExpr, distinct: Option, filter: Option, - order_by: Option>, + order_by: Option>, null_treatment: Option, ) -> PyResult { // If we initialize the UDAF with order_by directly, then it gets over-written by the builder @@ -677,26 +683,28 @@ pub fn first_value( // nth_value requires a non-expr argument #[pyfunction] +#[pyo3(signature = (expr, n, distinct=None, filter=None, order_by=None, null_treatment=None))] pub fn nth_value( expr: PyExpr, n: i64, distinct: Option, filter: Option, - order_by: Option>, + order_by: Option>, null_treatment: Option, ) -> PyResult { - let agg_fn = datafusion::functions_aggregate::nth_value::nth_value(vec![expr.expr, lit(n)]); + let agg_fn = datafusion::functions_aggregate::nth_value::nth_value(expr.expr, n, vec![]); add_builder_fns_to_aggregate(agg_fn, distinct, 
filter, order_by, null_treatment) } // string_agg requires a non-expr argument #[pyfunction] +#[pyo3(signature = (expr, delimiter, distinct=None, filter=None, order_by=None, null_treatment=None))] pub fn string_agg( expr: PyExpr, delimiter: String, distinct: Option, filter: Option, - order_by: Option>, + order_by: Option>, null_treatment: Option, ) -> PyResult { let agg_fn = datafusion::functions_aggregate::string_agg::string_agg(expr.expr, lit(delimiter)); @@ -706,7 +714,7 @@ pub fn string_agg( fn add_builder_fns_to_window( window_fn: Expr, partition_by: Option>, - order_by: Option>, + order_by: Option>, ) -> PyResult { // Since ExprFuncBuilder::new() is private, set an empty partition and then // override later if appropriate. @@ -730,12 +738,13 @@ fn add_builder_fns_to_window( } #[pyfunction] +#[pyo3(signature = (arg, shift_offset, default_value=None, partition_by=None, order_by=None))] pub fn lead( arg: PyExpr, shift_offset: i64, default_value: Option, partition_by: Option>, - order_by: Option>, + order_by: Option>, ) -> PyResult { let window_fn = window_function::lead(arg.expr, Some(shift_offset), default_value); @@ -743,12 +752,13 @@ pub fn lead( } #[pyfunction] +#[pyo3(signature = (arg, shift_offset, default_value=None, partition_by=None, order_by=None))] pub fn lag( arg: PyExpr, shift_offset: i64, default_value: Option, partition_by: Option>, - order_by: Option>, + order_by: Option>, ) -> PyResult { let window_fn = window_function::lag(arg.expr, Some(shift_offset), default_value); @@ -756,26 +766,32 @@ pub fn lag( } #[pyfunction] +#[pyo3(signature = (partition_by=None, order_by=None))] pub fn row_number( partition_by: Option>, - order_by: Option>, + order_by: Option>, ) -> PyResult { - let window_fn = window_function::row_number(); + let window_fn = datafusion::functions_window::expr_fn::row_number(); add_builder_fns_to_window(window_fn, partition_by, order_by) } #[pyfunction] -pub fn rank(partition_by: Option>, order_by: Option>) -> PyResult { 
+#[pyo3(signature = (partition_by=None, order_by=None))] +pub fn rank( + partition_by: Option>, + order_by: Option>, +) -> PyResult { let window_fn = window_function::rank(); add_builder_fns_to_window(window_fn, partition_by, order_by) } #[pyfunction] +#[pyo3(signature = (partition_by=None, order_by=None))] pub fn dense_rank( partition_by: Option>, - order_by: Option>, + order_by: Option>, ) -> PyResult { let window_fn = window_function::dense_rank(); @@ -783,9 +799,10 @@ pub fn dense_rank( } #[pyfunction] +#[pyo3(signature = (partition_by=None, order_by=None))] pub fn percent_rank( partition_by: Option>, - order_by: Option>, + order_by: Option>, ) -> PyResult { let window_fn = window_function::percent_rank(); @@ -793,9 +810,10 @@ pub fn percent_rank( } #[pyfunction] +#[pyo3(signature = (partition_by=None, order_by=None))] pub fn cume_dist( partition_by: Option>, - order_by: Option>, + order_by: Option>, ) -> PyResult { let window_fn = window_function::cume_dist(); @@ -803,10 +821,11 @@ pub fn cume_dist( } #[pyfunction] +#[pyo3(signature = (arg, partition_by=None, order_by=None))] pub fn ntile( arg: PyExpr, partition_by: Option>, - order_by: Option>, + order_by: Option>, ) -> PyResult { let window_fn = window_function::ntile(arg.into()); @@ -1002,8 +1021,8 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { // Window Functions m.add_wrapped(wrap_pyfunction!(lead))?; m.add_wrapped(wrap_pyfunction!(lag))?; - m.add_wrapped(wrap_pyfunction!(row_number))?; m.add_wrapped(wrap_pyfunction!(rank))?; + m.add_wrapped(wrap_pyfunction!(row_number))?; m.add_wrapped(wrap_pyfunction!(dense_rank))?; m.add_wrapped(wrap_pyfunction!(percent_rank))?; m.add_wrapped(wrap_pyfunction!(cume_dist))?; diff --git a/src/pyarrow_filter_expression.rs b/src/pyarrow_filter_expression.rs index 6e2a45e1a..0f97ea442 100644 --- a/src/pyarrow_filter_expression.rs +++ b/src/pyarrow_filter_expression.rs @@ -27,7 +27,7 @@ use datafusion::logical_expr::{expr::InList, Between, BinaryExpr, 
Expr, Operator use crate::errors::DataFusionError; -#[derive(Debug, Clone)] +#[derive(Debug)] #[repr(transparent)] pub(crate) struct PyArrowFilterExpression(PyObject); diff --git a/src/udf.rs b/src/udf.rs index 8bd9021d4..7d5db2f96 100644 --- a/src/udf.rs +++ b/src/udf.rs @@ -21,7 +21,8 @@ use pyo3::{prelude::*, types::PyTuple}; use datafusion::arrow::array::{make_array, Array, ArrayData, ArrayRef}; use datafusion::arrow::datatypes::DataType; -use datafusion::arrow::pyarrow::{FromPyArrow, PyArrowType, ToPyArrow}; +use datafusion::arrow::pyarrow::FromPyArrow; +use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; use datafusion::error::DataFusionError; use datafusion::logical_expr::create_udf; use datafusion::logical_expr::function::ScalarFunctionImplementation; @@ -43,16 +44,15 @@ fn to_rust_function(func: PyObject) -> ScalarFunctionImplementation { .iter() .map(|arg| arg.into_data().to_pyarrow(py).unwrap()) .collect::>(); - let py_args = PyTuple::new(py, py_args); + let py_args = PyTuple::new_bound(py, py_args); // 2. call function let value = func - .as_ref(py) - .call(py_args, None) + .call_bound(py, py_args, None) .map_err(|e| DataFusionError::Execution(format!("{e:?}")))?; // 3. 
cast to arrow::array::Array - let array_data = ArrayData::from_pyarrow(value).unwrap(); + let array_data = ArrayData::from_pyarrow_bound(value.bind(py)).unwrap(); Ok(make_array(array_data)) }) }, From a00cfbfdbf3143c8c56d9ea043be3fb69da008ee Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 18 Sep 2024 16:56:43 -0400 Subject: [PATCH 035/248] feat: aggregates as windows (#871) * Add to turn any aggregate function into a window function * Rename Window to WindowExpr so we can define Window to mean a window definition to be reused * Add unit test to cover default frames * Improve error report --- python/datafusion/expr.py | 57 ++++++++++++++++- python/datafusion/tests/test_dataframe.py | 75 ++++++++++++++++------- src/expr.rs | 46 +++++++++++++- src/expr/window.rs | 20 +++--- src/functions.rs | 29 +++++---- src/sql/logical.rs | 4 +- 6 files changed, 183 insertions(+), 48 deletions(-) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index fd5e6f04a..152aa38d3 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -92,7 +92,7 @@ Union = expr_internal.Union Unnest = expr_internal.Unnest UnnestExpr = expr_internal.UnnestExpr -Window = expr_internal.Window +WindowExpr = expr_internal.WindowExpr __all__ = [ "Expr", @@ -154,6 +154,7 @@ "Partitioning", "Repartition", "Window", + "WindowExpr", "WindowFrame", "WindowFrameBound", ] @@ -542,6 +543,36 @@ def window_frame(self, window_frame: WindowFrame) -> ExprFuncBuilder: """ return ExprFuncBuilder(self.expr.window_frame(window_frame.window_frame)) + def over(self, window: Window) -> Expr: + """Turn an aggregate function into a window function. + + This function turns any aggregate function into a window function. With the + exception of ``partition_by``, how each of the parameters is used is determined + by the underlying aggregate function. 
+ + Args: + window: Window definition + """ + partition_by_raw = expr_list_to_raw_expr_list(window._partition_by) + order_by_raw = sort_list_to_raw_sort_list(window._order_by) + window_frame_raw = ( + window._window_frame.window_frame + if window._window_frame is not None + else None + ) + null_treatment_raw = ( + window._null_treatment.value if window._null_treatment is not None else None + ) + + return Expr( + self.expr.over( + partition_by=partition_by_raw, + order_by=order_by_raw, + window_frame=window_frame_raw, + null_treatment=null_treatment_raw, + ) + ) + class ExprFuncBuilder: def __init__(self, builder: expr_internal.ExprFuncBuilder): @@ -584,6 +615,30 @@ def build(self) -> Expr: return Expr(self.builder.build()) +class Window: + """Define reusable window parameters.""" + + def __init__( + self, + partition_by: Optional[list[Expr]] = None, + window_frame: Optional[WindowFrame] = None, + order_by: Optional[list[SortExpr | Expr]] = None, + null_treatment: Optional[NullTreatment] = None, + ) -> None: + """Construct a window definition. 
+ + Args: + partition_by: Partitions for window operation + window_frame: Define the start and end bounds of the window frame + order_by: Set ordering + null_treatment: Indicate how nulls are to be treated + """ + self._partition_by = partition_by + self._window_frame = window_frame + self._order_by = order_by + self._null_treatment = null_treatment + + class WindowFrame: """Defines a window frame for performing window operations.""" diff --git a/python/datafusion/tests/test_dataframe.py b/python/datafusion/tests/test_dataframe.py index 90954d09a..ad7f728b4 100644 --- a/python/datafusion/tests/test_dataframe.py +++ b/python/datafusion/tests/test_dataframe.py @@ -31,6 +31,7 @@ literal, udf, ) +from datafusion.expr import Window @pytest.fixture @@ -386,38 +387,32 @@ def test_distinct(): ), [-1, -1, None, 7, -1, -1, None], ), - # TODO update all aggregate functions as windows once upstream merges https://github.com/apache/datafusion-python/issues/833 - pytest.param( + ( "first_value", - f.window( - "first_value", - [column("a")], - order_by=[f.order_by(column("b"))], - partition_by=[column("c")], + f.first_value(column("a")).over( + Window(partition_by=[column("c")], order_by=[column("b")]) ), [1, 1, 1, 1, 5, 5, 5], ), - pytest.param( + ( "last_value", - f.window("last_value", [column("a")]) - .window_frame(WindowFrame("rows", 0, None)) - .order_by(column("b")) - .partition_by(column("c")) - .build(), + f.last_value(column("a")).over( + Window( + partition_by=[column("c")], + order_by=[column("b")], + window_frame=WindowFrame("rows", None, None), + ) + ), [3, 3, 3, 3, 6, 6, 6], ), - pytest.param( + ( "3rd_value", - f.window( - "nth_value", - [column("b"), literal(3)], - order_by=[f.order_by(column("a"))], - ), + f.nth_value(column("b"), 3).over(Window(order_by=[column("a")])), [None, None, 7, 7, 7, 7, 7], ), - pytest.param( + ( "avg", - f.round(f.window("avg", [column("b")], order_by=[column("a")]), literal(3)), + 
f.round(f.avg(column("b")).over(Window(order_by=[column("a")])), literal(3)), [7.0, 7.0, 7.0, 7.333, 7.75, 7.75, 8.0], ), ] @@ -473,6 +468,44 @@ def test_invalid_window_frame(units, start_bound, end_bound): WindowFrame(units, start_bound, end_bound) +def test_window_frame_defaults_match_postgres(partitioned_df): + # ref: https://github.com/apache/datafusion-python/issues/688 + + window_frame = WindowFrame("rows", None, None) + + col_a = column("a") + + # Using `f.window` with or without an unbounded window_frame produces the same + # results. These tests are included as a regression check but can be removed when + # f.window() is deprecated in favor of using the .over() approach. + no_frame = f.window("avg", [col_a]).alias("no_frame") + with_frame = f.window("avg", [col_a], window_frame=window_frame).alias("with_frame") + df_1 = partitioned_df.select(col_a, no_frame, with_frame) + + expected = { + "a": [0, 1, 2, 3, 4, 5, 6], + "no_frame": [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0], + "with_frame": [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0], + } + + assert df_1.sort(col_a).to_pydict() == expected + + # When order is not set, the default frame should be unounded preceeding to + # unbounded following. When order is set, the default frame is unbounded preceeding + # to current row. + no_order = f.avg(col_a).over(Window()).alias("over_no_order") + with_order = f.avg(col_a).over(Window(order_by=[col_a])).alias("over_with_order") + df_2 = partitioned_df.select(col_a, no_order, with_order) + + expected = { + "a": [0, 1, 2, 3, 4, 5, 6], + "over_no_order": [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0], + "over_with_order": [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0], + } + + assert df_2.sort(col_a).to_pydict() == expected + + def test_get_dataframe(tmp_path): ctx = SessionContext() diff --git a/src/expr.rs b/src/expr.rs index 304d147c9..49fa4b845 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -16,7 +16,9 @@ // under the License. 
use datafusion::logical_expr::utils::exprlist_to_fields; -use datafusion::logical_expr::{ExprFuncBuilder, ExprFunctionExt, LogicalPlan}; +use datafusion::logical_expr::{ + ExprFuncBuilder, ExprFunctionExt, LogicalPlan, WindowFunctionDefinition, +}; use pyo3::{basic::CompareOp, prelude::*}; use std::convert::{From, Into}; use std::sync::Arc; @@ -39,6 +41,7 @@ use crate::expr::aggregate_expr::PyAggregateFunction; use crate::expr::binary_expr::PyBinaryExpr; use crate::expr::column::PyColumn; use crate::expr::literal::PyLiteral; +use crate::functions::add_builder_fns_to_window; use crate::sql::logical::PyLogicalPlan; use self::alias::PyAlias; @@ -558,6 +561,45 @@ impl PyExpr { pub fn window_frame(&self, window_frame: PyWindowFrame) -> PyExprFuncBuilder { self.expr.clone().window_frame(window_frame.into()).into() } + + #[pyo3(signature = (partition_by=None, window_frame=None, order_by=None, null_treatment=None))] + pub fn over( + &self, + partition_by: Option>, + window_frame: Option, + order_by: Option>, + null_treatment: Option, + ) -> PyResult { + match &self.expr { + Expr::AggregateFunction(agg_fn) => { + let window_fn = Expr::WindowFunction(WindowFunction::new( + WindowFunctionDefinition::AggregateUDF(agg_fn.func.clone()), + agg_fn.args.clone(), + )); + + add_builder_fns_to_window( + window_fn, + partition_by, + window_frame, + order_by, + null_treatment, + ) + } + Expr::WindowFunction(_) => add_builder_fns_to_window( + self.expr.clone(), + partition_by, + window_frame, + order_by, + null_treatment, + ), + _ => Err( + DataFusionError::ExecutionError(datafusion::error::DataFusionError::Plan( + format!("Using {} with `over` is not allowed. 
Must use an aggregate or window function.", self.expr.variant_name()), + )) + .into(), + ), + } + } } #[pyclass(name = "ExprFuncBuilder", module = "datafusion.expr", subclass)] @@ -749,7 +791,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; Ok(()) diff --git a/src/expr/window.rs b/src/expr/window.rs index 950db12ae..6486dbb32 100644 --- a/src/expr/window.rs +++ b/src/expr/window.rs @@ -32,9 +32,9 @@ use super::py_expr_list; use crate::errors::py_datafusion_err; -#[pyclass(name = "Window", module = "datafusion.expr", subclass)] +#[pyclass(name = "WindowExpr", module = "datafusion.expr", subclass)] #[derive(Clone)] -pub struct PyWindow { +pub struct PyWindowExpr { window: Window, } @@ -62,15 +62,15 @@ pub struct PyWindowFrameBound { frame_bound: WindowFrameBound, } -impl From for Window { - fn from(window: PyWindow) -> Window { +impl From for Window { + fn from(window: PyWindowExpr) -> Window { window.window } } -impl From for PyWindow { - fn from(window: Window) -> PyWindow { - PyWindow { window } +impl From for PyWindowExpr { + fn from(window: Window) -> PyWindowExpr { + PyWindowExpr { window } } } @@ -80,7 +80,7 @@ impl From for PyWindowFrameBound { } } -impl Display for PyWindow { +impl Display for PyWindowExpr { fn fmt(&self, f: &mut Formatter) -> fmt::Result { write!( f, @@ -103,7 +103,7 @@ impl Display for PyWindowFrame { } #[pymethods] -impl PyWindow { +impl PyWindowExpr { /// Returns the schema of the Window pub fn schema(&self) -> PyResult { Ok(self.window.schema.as_ref().clone().into()) @@ -283,7 +283,7 @@ impl PyWindowFrameBound { } } -impl LogicalNode for PyWindow { +impl LogicalNode for PyWindowExpr { fn inputs(&self) -> Vec { vec![self.window.input.as_ref().clone().into()] } diff --git a/src/functions.rs b/src/functions.rs index 32f6519f7..6f8dd7ada 100644 --- a/src/functions.rs +++ 
b/src/functions.rs @@ -711,14 +711,15 @@ pub fn string_agg( add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) } -fn add_builder_fns_to_window( +pub(crate) fn add_builder_fns_to_window( window_fn: Expr, partition_by: Option>, + window_frame: Option, order_by: Option>, + null_treatment: Option, ) -> PyResult { - // Since ExprFuncBuilder::new() is private, set an empty partition and then - // override later if appropriate. - let mut builder = window_fn.partition_by(vec![]); + let null_treatment = null_treatment.map(|n| n.into()); + let mut builder = window_fn.null_treatment(null_treatment); if let Some(partition_cols) = partition_by { builder = builder.partition_by( @@ -734,6 +735,10 @@ fn add_builder_fns_to_window( builder = builder.order_by(order_by_cols); } + if let Some(window_frame) = window_frame { + builder = builder.window_frame(window_frame.into()); + } + builder.build().map(|e| e.into()).map_err(|err| err.into()) } @@ -748,7 +753,7 @@ pub fn lead( ) -> PyResult { let window_fn = window_function::lead(arg.expr, Some(shift_offset), default_value); - add_builder_fns_to_window(window_fn, partition_by, order_by) + add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) } #[pyfunction] @@ -762,7 +767,7 @@ pub fn lag( ) -> PyResult { let window_fn = window_function::lag(arg.expr, Some(shift_offset), default_value); - add_builder_fns_to_window(window_fn, partition_by, order_by) + add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) } #[pyfunction] @@ -773,7 +778,7 @@ pub fn row_number( ) -> PyResult { let window_fn = datafusion::functions_window::expr_fn::row_number(); - add_builder_fns_to_window(window_fn, partition_by, order_by) + add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) } #[pyfunction] @@ -784,7 +789,7 @@ pub fn rank( ) -> PyResult { let window_fn = window_function::rank(); - add_builder_fns_to_window(window_fn, partition_by, order_by) + 
add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) } #[pyfunction] @@ -795,7 +800,7 @@ pub fn dense_rank( ) -> PyResult { let window_fn = window_function::dense_rank(); - add_builder_fns_to_window(window_fn, partition_by, order_by) + add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) } #[pyfunction] @@ -806,7 +811,7 @@ pub fn percent_rank( ) -> PyResult { let window_fn = window_function::percent_rank(); - add_builder_fns_to_window(window_fn, partition_by, order_by) + add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) } #[pyfunction] @@ -817,7 +822,7 @@ pub fn cume_dist( ) -> PyResult { let window_fn = window_function::cume_dist(); - add_builder_fns_to_window(window_fn, partition_by, order_by) + add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) } #[pyfunction] @@ -829,7 +834,7 @@ pub fn ntile( ) -> PyResult { let window_fn = window_function::ntile(arg.into()); - add_builder_fns_to_window(window_fn, partition_by, order_by) + add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) } pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { diff --git a/src/sql/logical.rs b/src/sql/logical.rs index 89655ab70..d00f0af3f 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -34,7 +34,7 @@ use crate::expr::subquery::PySubquery; use crate::expr::subquery_alias::PySubqueryAlias; use crate::expr::table_scan::PyTableScan; use crate::expr::unnest::PyUnnest; -use crate::expr::window::PyWindow; +use crate::expr::window::PyWindowExpr; use datafusion::logical_expr::LogicalPlan; use pyo3::prelude::*; @@ -80,7 +80,7 @@ impl PyLogicalPlan { LogicalPlan::Subquery(plan) => PySubquery::from(plan.clone()).to_variant(py), LogicalPlan::SubqueryAlias(plan) => PySubqueryAlias::from(plan.clone()).to_variant(py), LogicalPlan::Unnest(plan) => PyUnnest::from(plan.clone()).to_variant(py), - LogicalPlan::Window(plan) => PyWindow::from(plan.clone()).to_variant(py), + 
LogicalPlan::Window(plan) => PyWindowExpr::from(plan.clone()).to_variant(py), LogicalPlan::Repartition(_) | LogicalPlan::Union(_) | LogicalPlan::Statement(_) From 044bbe2f32a208f3e745971468d8769ca82e5d70 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 21 Sep 2024 10:33:51 -0400 Subject: [PATCH 036/248] Fix regression on register_udaf (#878) * Test no longer hangs, and updated error string to match latest * Add unit tests for registering udf and udaf * Resolve error on registering udaf #874 * remove stale comment * Update unit test text to match in multiple versions of python * Regex for exception that is compatible with python 3.10 and 3.12 --- python/datafusion/tests/test_udaf.py | 47 +++++++++++++++++++++++----- python/datafusion/udf.py | 4 +-- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/python/datafusion/tests/test_udaf.py b/python/datafusion/tests/test_udaf.py index 76488e19b..6f2525b0f 100644 --- a/python/datafusion/tests/test_udaf.py +++ b/python/datafusion/tests/test_udaf.py @@ -21,7 +21,7 @@ import pyarrow.compute as pc import pytest -from datafusion import Accumulator, SessionContext, column, udaf +from datafusion import Accumulator, column, udaf, udf class Summarize(Accumulator): @@ -60,18 +60,15 @@ def state(self) -> List[pa.Scalar]: @pytest.fixture -def df(): - ctx = SessionContext() - +def df(ctx): # create a RecordBatch and a new DataFrame from it batch = pa.RecordBatch.from_arrays( [pa.array([1, 2, 3]), pa.array([4, 4, 6])], names=["a", "b"], ) - return ctx.create_dataframe([[batch]]) + return ctx.create_dataframe([[batch]], name="test_table") -@pytest.mark.skip(reason="df.collect() will hang, need more investigations") def test_errors(df): with pytest.raises(TypeError): udaf( @@ -92,8 +89,9 @@ def test_errors(df): df = df.aggregate([], [accum(column("a"))]) msg = ( - "Can't instantiate abstract class MissingMethods with abstract " - "methods evaluate, merge, update" + "Can't instantiate abstract class MissingMethods (without 
an implementation " + "for abstract methods 'evaluate', 'merge', 'update'|with abstract methods " + "evaluate, merge, update)" ) with pytest.raises(Exception, match=msg): df.collect() @@ -132,3 +130,36 @@ def test_group_by(df): arrays = [batch.column(1) for batch in batches] joined = pa.concat_arrays(arrays) assert joined == pa.array([1.0 + 2.0, 3.0]) + + +def test_register_udaf(ctx, df) -> None: + summarize = udaf( + Summarize, + pa.float64(), + pa.float64(), + [pa.float64()], + volatility="immutable", + ) + + ctx.register_udaf(summarize) + + df_result = ctx.sql("select summarize(b) from test_table") + + assert df_result.collect()[0][0][0].as_py() == 14.0 + + +def test_register_udf(ctx, df) -> None: + is_null = udf( + lambda x: x.is_null(), + [pa.float64()], + pa.bool_(), + volatility="immutable", + name="is_null", + ) + + ctx.register_udf(is_null) + + df_result = ctx.sql("select is_null(a) from test_table") + result = df_result.collect()[0].column(0) + + assert result == pa.array([False, False, False]) diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index a3b74bb11..f74d675e3 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -192,7 +192,7 @@ def __init__( See :py:func:`udaf` for a convenience function and argument descriptions. """ - self._udf = df_internal.AggregateUDF( + self._udaf = df_internal.AggregateUDF( name, accumulator, input_types, return_type, state_type, str(volatility) ) @@ -203,7 +203,7 @@ def __call__(self, *args: Expr) -> Expr: occur during the evaluation of the dataframe. 
""" args = [arg.expr for arg in args] - return Expr(self._udf.__call__(*args)) + return Expr(self._udaf.__call__(*args)) @staticmethod def udaf( From a18ece8bfb5f5814cb58a731c7856c630882a210 Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Sun, 22 Sep 2024 12:41:28 -0500 Subject: [PATCH 037/248] build(deps): upgrade setup-protoc action and protoc version number (#873) `arduino/setup-protoc@v3` could not find our old version number of `3.20.2`. I chose protoc version 27.4 because it's the last release from the prior version. --- .github/workflows/build.yml | 8 ++++---- .github/workflows/docs.yaml | 4 ++-- .github/workflows/test.yaml | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5fae13f2e..f52913ce8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -92,9 +92,9 @@ jobs: path: . - name: Install Protoc - uses: arduino/setup-protoc@v1 + uses: arduino/setup-protoc@v3 with: - version: "3.20.2" + version: "27.4" repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Build Python package @@ -150,9 +150,9 @@ jobs: path: . 
- name: Install Protoc - uses: arduino/setup-protoc@v1 + uses: arduino/setup-protoc@v3 with: - version: "3.20.2" + version: "27.4" repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Build Python package diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 60dc15927..e47497b2a 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -52,9 +52,9 @@ jobs: python-version: "3.11" - name: Install Protoc - uses: arduino/setup-protoc@v1 + uses: arduino/setup-protoc@v3 with: - version: '3.20.2' + version: '27.4' repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Install dependencies diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4f47dc984..f9383db5f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -50,9 +50,9 @@ jobs: override: true - name: Install Protoc - uses: arduino/setup-protoc@v1 + uses: arduino/setup-protoc@v3 with: - version: '3.20.2' + version: '27.4' repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Setup Python From f6261b05ce0d68489403fe966a464cdd7d24b3b5 Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Sun, 22 Sep 2024 12:42:22 -0500 Subject: [PATCH 038/248] add a test to capture the bug (#843) --- python/datafusion/tests/test_context.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/python/datafusion/tests/test_context.py b/python/datafusion/tests/test_context.py index 4af00a3b4..ab86faa9d 100644 --- a/python/datafusion/tests/test_context.py +++ b/python/datafusion/tests/test_context.py @@ -372,6 +372,25 @@ def test_dataset_filter(ctx, capfd): assert result[0].column(1) == pa.array([-3]) +def test_dataset_count(ctx): + # `datafusion-python` issue: https://github.com/apache/datafusion-python/issues/800 + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + dataset = ds.dataset([batch]) + ctx.register_dataset("t", dataset) + + # Testing the dataframe API + df = ctx.table("t") + assert 
df.count() == 3 + + # Testing the SQL API + count = ctx.sql("SELECT COUNT(*) FROM t") + count = count.collect() + assert count[0].column(0) == pa.array([3]) + + def test_pyarrow_predicate_pushdown_is_null(ctx, capfd): """Ensure that pyarrow filter gets pushed down for `IsNull`""" # create a RecordBatch and register it as a pyarrow.dataset.Dataset From 1952e67d7e10aa18b63b6916be68ae851bee9b14 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 26 Sep 2024 15:52:48 -0500 Subject: [PATCH 039/248] build(deps): bump prost-types from 0.13.2 to 0.13.3 (#881) Bumps [prost-types](https://github.com/tokio-rs/prost) from 0.13.2 to 0.13.3. - [Release notes](https://github.com/tokio-rs/prost/releases) - [Changelog](https://github.com/tokio-rs/prost/blob/master/CHANGELOG.md) - [Commits](https://github.com/tokio-rs/prost/compare/v0.13.2...v0.13.3) --- updated-dependencies: - dependency-name: prost-types dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d06073b6f..e48339e98 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2270,9 +2270,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b2ecbe40f08db5c006b5764a2645f7f3f141ce756412ac9e1dd6087e6d32995" +checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f" dependencies = [ "bytes", "prost-derive", @@ -2301,9 +2301,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acf0c195eebb4af52c752bec4f52f645da98b6e92077a04110c7f349477ae5ac" +checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" dependencies = [ "anyhow", "itertools", @@ -2314,9 +2314,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60caa6738c7369b940c3d49246a8d1749323674c65cb13010134f5c9bad5b519" +checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670" dependencies = [ "prost", ] From 8b36aac1cd9eb16189e6464fd4df2a5213a7be7e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 26 Sep 2024 15:53:03 -0500 Subject: [PATCH 040/248] build(deps): bump prost from 0.13.2 to 0.13.3 (#882) Bumps [prost](https://github.com/tokio-rs/prost) from 0.13.2 to 0.13.3. 
- [Release notes](https://github.com/tokio-rs/prost/releases) - [Changelog](https://github.com/tokio-rs/prost/blob/master/CHANGELOG.md) - [Commits](https://github.com/tokio-rs/prost/compare/v0.13.2...v0.13.3) --- updated-dependencies: - dependency-name: prost dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> From 2df33d33fcf3f01e83884f4e8943330cb6458d95 Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Sat, 28 Sep 2024 17:08:36 -0500 Subject: [PATCH 041/248] chore: remove XFAIL from passing tests (#884) * remove xfail from passing tests * update xfail comment for datetime_s * document upstream issue for timestamp[s] xfail test --- python/datafusion/tests/test_sql.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/python/datafusion/tests/test_sql.py b/python/datafusion/tests/test_sql.py index cbb2e9f57..bd2ae58d7 100644 --- a/python/datafusion/tests/test_sql.py +++ b/python/datafusion/tests/test_sql.py @@ -378,23 +378,29 @@ def test_udf( helpers.data_binary_other(), helpers.data_date32(), helpers.data_with_nans(), - # C data interface missing pytest.param( pa.array([b"1111", b"2222", b"3333"], pa.binary(4), _null_mask), id="binary4", - marks=pytest.mark.xfail, ), + # `timestamp[s]` does not roundtrip for pyarrow.parquet: https://github.com/apache/arrow/issues/41382 pytest.param( - helpers.data_datetime("s"), id="datetime_s", marks=pytest.mark.xfail + helpers.data_datetime("s"), + id="datetime_s", + marks=pytest.mark.xfail( + reason="pyarrow.parquet does not support timestamp[s] roundtrips" + ), ), pytest.param( - helpers.data_datetime("ms"), id="datetime_ms", marks=pytest.mark.xfail + helpers.data_datetime("ms"), + id="datetime_ms", ), pytest.param( - helpers.data_datetime("us"), id="datetime_us", marks=pytest.mark.xfail + helpers.data_datetime("us"), + id="datetime_us", ), 
pytest.param( - helpers.data_datetime("ns"), id="datetime_ns", marks=pytest.mark.xfail + helpers.data_datetime("ns"), + id="datetime_ns", ), # Not writtable to parquet pytest.param( From f8224953b281bf157b3ddc18d88a7085852a6bdc Mon Sep 17 00:00:00 2001 From: Daniel Mesejo Date: Sun, 29 Sep 2024 14:19:55 +0200 Subject: [PATCH 042/248] feat: make register_csv accept a list of paths (#883) --- python/datafusion/context.py | 11 +++-- python/datafusion/tests/test_sql.py | 35 ++++++++++++++++ src/context.rs | 64 +++++++++++++++++++++++++---- 3 files changed, 99 insertions(+), 11 deletions(-) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 35a40ccd4..2c41faba6 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -714,7 +714,7 @@ def register_parquet( def register_csv( self, name: str, - path: str | pathlib.Path, + path: str | pathlib.Path | list[str | pathlib.Path], schema: pyarrow.Schema | None = None, has_header: bool = True, delimiter: str = ",", @@ -728,7 +728,7 @@ def register_csv( Args: name: Name of the table to register. - path: Path to the CSV file. + path: Path to the CSV file. It also accepts a list of Paths. schema: An optional schema representing the CSV file. If None, the CSV reader will try to infer it based on data in file. has_header: Whether the CSV file have a header. If schema inference @@ -741,9 +741,14 @@ def register_csv( selected for data input. file_compression_type: File compression type. 
""" + if isinstance(path, list): + path = [str(p) for p in path] + else: + path = str(path) + self.ctx.register_csv( name, - str(path), + path, schema, has_header, delimiter, diff --git a/python/datafusion/tests/test_sql.py b/python/datafusion/tests/test_sql.py index bd2ae58d7..e39a9f5c7 100644 --- a/python/datafusion/tests/test_sql.py +++ b/python/datafusion/tests/test_sql.py @@ -104,6 +104,41 @@ def test_register_csv(ctx, tmp_path): ctx.register_csv("csv4", path, file_compression_type="rar") +def test_register_csv_list(ctx, tmp_path): + path = tmp_path / "test.csv" + + int_values = [1, 2, 3, 4] + table = pa.Table.from_arrays( + [ + int_values, + ["a", "b", "c", "d"], + [1.1, 2.2, 3.3, 4.4], + ], + names=["int", "str", "float"], + ) + write_csv(table, path) + ctx.register_csv("csv", path) + + csv_df = ctx.table("csv") + expected_count = csv_df.count() * 2 + ctx.register_csv( + "double_csv", + path=[ + path, + path, + ], + ) + + double_csv_df = ctx.table("double_csv") + actual_count = double_csv_df.count() + assert actual_count == expected_count + + int_sum = ctx.sql("select sum(int) from double_csv").to_pydict()[ + "sum(double_csv.int)" + ][0] + assert int_sum == 2 * sum(int_values) + + def test_register_parquet(ctx, tmp_path): path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) ctx.register_parquet("t", path) diff --git a/src/context.rs b/src/context.rs index 79db2e65c..7ad12ceb0 100644 --- a/src/context.rs +++ b/src/context.rs @@ -46,7 +46,8 @@ use crate::utils::{get_tokio_runtime, wait_for_future}; use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::arrow::pyarrow::PyArrowType; use datafusion::arrow::record_batch::RecordBatch; -use datafusion::common::ScalarValue; +use datafusion::catalog_common::TableReference; +use datafusion::common::{exec_err, ScalarValue}; use datafusion::datasource::file_format::file_compression_type::FileCompressionType; use datafusion::datasource::file_format::parquet::ParquetFormat; use 
datafusion::datasource::listing::{ @@ -54,9 +55,12 @@ use datafusion::datasource::listing::{ }; use datafusion::datasource::MemTable; use datafusion::datasource::TableProvider; -use datafusion::execution::context::{SQLOptions, SessionConfig, SessionContext, TaskContext}; +use datafusion::execution::context::{ + DataFilePaths, SQLOptions, SessionConfig, SessionContext, TaskContext, +}; use datafusion::execution::disk_manager::DiskManagerConfig; use datafusion::execution::memory_pool::{FairSpillPool, GreedyMemoryPool, UnboundedMemoryPool}; +use datafusion::execution::options::ReadOptions; use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion::prelude::{ @@ -621,7 +625,7 @@ impl PySessionContext { pub fn register_csv( &mut self, name: &str, - path: PathBuf, + path: &Bound<'_, PyAny>, schema: Option>, has_header: bool, delimiter: &str, @@ -630,9 +634,6 @@ impl PySessionContext { file_compression_type: Option, py: Python, ) -> PyResult<()> { - let path = path - .to_str() - .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; let delimiter = delimiter.as_bytes(); if delimiter.len() != 1 { return Err(PyValueError::new_err( @@ -648,8 +649,15 @@ impl PySessionContext { .file_compression_type(parse_file_compression_type(file_compression_type)?); options.schema = schema.as_ref().map(|x| &x.0); - let result = self.ctx.register_csv(name, path, options); - wait_for_future(py, result).map_err(DataFusionError::from)?; + if path.is_instance_of::() { + let paths = path.extract::>()?; + let result = self.register_csv_from_multiple_paths(name, paths, options); + wait_for_future(py, result).map_err(DataFusionError::from)?; + } else { + let path = path.extract::()?; + let result = self.ctx.register_csv(name, &path, options); + wait_for_future(py, result).map_err(DataFusionError::from)?; + } Ok(()) } @@ -981,6 +989,46 @@ impl PySessionContext { async fn _table(&self, name: 
&str) -> datafusion::common::Result { self.ctx.table(name).await } + + async fn register_csv_from_multiple_paths( + &self, + name: &str, + table_paths: Vec, + options: CsvReadOptions<'_>, + ) -> datafusion::common::Result<()> { + let table_paths = table_paths.to_urls()?; + let session_config = self.ctx.copied_config(); + let listing_options = + options.to_listing_options(&session_config, self.ctx.copied_table_options()); + + let option_extension = listing_options.file_extension.clone(); + + if table_paths.is_empty() { + return exec_err!("No table paths were provided"); + } + + // check if the file extension matches the expected extension + for path in &table_paths { + let file_path = path.as_str(); + if !file_path.ends_with(option_extension.clone().as_str()) && !path.is_collection() { + return exec_err!( + "File path '{file_path}' does not match the expected extension '{option_extension}'" + ); + } + } + + let resolved_schema = options + .get_resolved_schema(&session_config, self.ctx.state(), table_paths[0].clone()) + .await?; + + let config = ListingTableConfig::new_with_multi_paths(table_paths) + .with_listing_options(listing_options) + .with_schema(resolved_schema); + let table = ListingTable::try_new(config)?; + self.ctx + .register_table(TableReference::Bare { table: name.into() }, Arc::new(table))?; + Ok(()) + } } pub fn convert_table_partition_cols( From 022e4b323c20dd306eac8c9b4ea65a9f23d9bcac Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 30 Sep 2024 07:43:23 -0400 Subject: [PATCH 043/248] Add user defined window function support (#880) * Adding PyWindowUDF and implementing PartitionEvaluator for it. Still requires python side work. 
* Add python wrappers for UDWF * adding unit tests for user defined window functions * Change udwf() to take an instance rather than a class so we can parameterize it * Pass multiple arrays for udwf evaluate so we can capture the order_by and also multiple columns * Update udwf to take multiple input columns * Add user example for UDWF * Update template for how values are passed to update * Add user documentation for UDWF * Updating documentation per PR review --- .../common-operations/udf-and-udfa.rst | 194 +++++++++-- examples/python-udwf.py | 270 ++++++++++++++++ python/datafusion/__init__.py | 5 +- python/datafusion/context.py | 7 +- python/datafusion/tests/test_udwf.py | 294 +++++++++++++++++ python/datafusion/udf.py | 254 ++++++++++++++- src/context.rs | 6 + src/lib.rs | 2 + src/udwf.rs | 305 ++++++++++++++++++ 9 files changed, 1306 insertions(+), 31 deletions(-) create mode 100644 examples/python-udwf.py create mode 100644 python/datafusion/tests/test_udwf.py create mode 100644 src/udwf.rs diff --git a/docs/source/user-guide/common-operations/udf-and-udfa.rst b/docs/source/user-guide/common-operations/udf-and-udfa.rst index e9c142f0a..ffd7a05cb 100644 --- a/docs/source/user-guide/common-operations/udf-and-udfa.rst +++ b/docs/source/user-guide/common-operations/udf-and-udfa.rst @@ -15,11 +15,24 @@ .. specific language governing permissions and limitations .. under the License. -User Defined Functions +User-Defined Functions ====================== -DataFusion provides powerful expressions and functions, reducing the need for custom Python functions. -However you can still incorporate your own functions, i.e. User-Defined Functions (UDFs), with the :py:func:`~datafusion.udf.ScalarUDF.udf` function. +DataFusion provides powerful expressions and functions, reducing the need for custom Python +functions. However you can still incorporate your own functions, i.e. User-Defined Functions (UDFs). 
+ +Scalar Functions +---------------- + +User-defined functions that operate on a row by row basis are called Scalar +Functions. You can define your own scalar function by calling +:py:func:`~datafusion.udf.ScalarUDF.udf`. + +The basic definition of a scalar UDF is a python function that takes one or more +`pyarrow <https://arrow.apache.org/docs/python/index.html>`_ arrays and returns a single array as +output. DataFusion scalar UDFs operate on an entire batch of records at a time, though the +evaluation of those records should be on a row by row basis. In the following example, we compute +whether each value of the input array is null. .. ipython:: python @@ -35,14 +48,67 @@ However you can still incorporate your own functions, i.e. User-Defined Function ctx = datafusion.SessionContext() batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], + [pyarrow.array([1, None, 3]), pyarrow.array([4, 5, 6])], names=["a", "b"], ) df = ctx.create_dataframe([[batch]], name="batch_array") - df.select(is_null_arr(col("a"))).to_pandas() + df.select(col("a"), is_null_arr(col("a")).alias("is_null")).show() + +In the previous example, we used the fact that pyarrow provides a variety of built in array +functions such as ``is_null()``. There are additional pyarrow +`compute functions <https://arrow.apache.org/docs/python/compute.html>`_ available. When possible, +it is highly recommended to use these functions because they can perform computations without doing +any copy operations from the original arrays. This leads to greatly improved performance. + +If you need to perform an operation in python that is not available with the pyarrow compute +functions, you will need to convert the record batch into python values, perform your operation, +and construct an array. This operation of converting the built in data type of the array into a +python object can be one of the slowest operations in DataFusion, so it should be done sparingly.
+ +The following example performs the same operation as before with ``is_null`` but demonstrates +converting to Python objects to do the evaluation. + +.. ipython:: python + + import pyarrow + import datafusion + from datafusion import udf, col + + def is_null(array: pyarrow.Array) -> pyarrow.Array: + return pyarrow.array([value.as_py() is None for value in array]) + + is_null_arr = udf(is_null, [pyarrow.int64()], pyarrow.bool_(), 'stable') + + ctx = datafusion.SessionContext() + + batch = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1, None, 3]), pyarrow.array([4, 5, 6])], + names=["a", "b"], + ) + df = ctx.create_dataframe([[batch]], name="batch_array") -Additionally the :py:func:`~datafusion.udf.AggregateUDF.udaf` function allows you to define User-Defined Aggregate Functions (UDAFs) + df.select(col("a"), is_null_arr(col("a")).alias("is_null")).show() + +Aggregate Functions +------------------- + +The :py:func:`~datafusion.udf.AggregateUDF.udaf` function allows you to define User-Defined +Aggregate Functions (UDAFs). To use this you must implement an +:py:class:`~datafusion.udf.Accumulator` that determines how the aggregation is performed. + +When defining a UDAF there are four methods you need to implement. The ``update`` function takes the +array(s) of input and updates the internal state of the accumulator. You should define this function +to have as many input arguments as you will pass when calling the UDAF. Since aggregation may be +split into multiple batches, we must have a method to combine multiple batches. For this, we have +two functions, ``state`` and ``merge``. ``state`` will return an array of scalar values that contain +the current state of a single batch accumulation. Then we must ``merge`` the results of these +different states. Finally ``evaluate`` is the call that will return the final result after the +``merge`` is complete. 
+ +In the following example we want to define a custom aggregate function that will return the +difference between the sum of two columns. The state can be represented by a single value and we can +also see how the inputs to ``update`` and ``merge`` differ. .. code-block:: python @@ -57,30 +123,122 @@ Additionally the :py:func:`~datafusion.udf.AggregateUDF.udaf` function allows yo Interface of a user-defined accumulation. """ def __init__(self): - self._sum = pyarrow.scalar(0.0) + self._sum = 0.0 - def update(self, values: pyarrow.Array) -> None: - # not nice since pyarrow scalars can't be summed yet. This breaks on `None` - self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(values).as_py()) + def update(self, values_a: pyarrow.Array, values_b: pyarrow.Array) -> None: + self._sum = self._sum + pyarrow.compute.sum(values_a).as_py() - pyarrow.compute.sum(values_b).as_py() def merge(self, states: List[pyarrow.Array]) -> None: - # not nice since pyarrow scalars can't be summed yet. 
This breaks on `None` - self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(states[0]).as_py()) + self._sum = self._sum + pyarrow.compute.sum(states[0]).as_py() def state(self) -> pyarrow.Array: - return pyarrow.array([self._sum.as_py()]) + return pyarrow.array([self._sum]) def evaluate(self) -> pyarrow.Scalar: - return self._sum + return pyarrow.scalar(self._sum) ctx = datafusion.SessionContext() df = ctx.from_pydict( { - "a": [1, 2, 3], - "b": [4, 5, 6], + "a": [4, 5, 6], + "b": [1, 2, 3], } ) - my_udaf = udaf(MyAccumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()], 'stable') + my_udaf = udaf(MyAccumulator, [pyarrow.float64(), pyarrow.float64()], pyarrow.float64(), [pyarrow.float64()], 'stable') + + df.aggregate([], [my_udaf(col("a"), col("b")).alias("col_diff")]) + +Window Functions +---------------- + +To implement a User-Defined Window Function (UDWF) you must call the +:py:func:`~datafusion.udf.WindowUDF.udwf` function using a class that implements the abstract +class :py:class:`~datafusion.udf.WindowEvaluator`. + +There are three methods of evaluation of UDWFs. + +- ``evaluate`` is the simplest case, where you are given an array and are expected to calculate the + value for a single row of that array. This is the simplest case, but also the least performant. +- ``evaluate_all`` computes the values for all rows for an input array at a single time. +- ``evaluate_all_with_rank`` computes the values for all rows, but you only have the rank + information for the rows. + +Which methods you implement are based upon which of these options are set. + +.. 
list-table:: + :header-rows: 1 + + * - ``uses_window_frame`` + - ``supports_bounded_execution`` + - ``include_rank`` + - function_to_implement + * - False (default) + - False (default) + - False (default) + - ``evaluate_all`` + * - False + - True + - False + - ``evaluate`` + * - False + - True/False + - True + - ``evaluate_all_with_rank`` + * - True + - True/False + - True/False + - ``evaluate`` + +UDWF options +^^^^^^^^^^^^ + +When you define your UDWF you can override the functions that return these values. They will +determine which evaluate functions are called. + +- ``uses_window_frame`` is set for functions that compute based on the specified window frame. If + your function depends upon the specified frame, set this to ``True``. +- ``supports_bounded_execution`` specifies if your function can be incrementally computed. +- ``include_rank`` is set to ``True`` for window functions that can be computed only using the rank + information. + + +.. code-block:: python + + import pyarrow as pa + from datafusion import udwf, col, SessionContext + from datafusion.udf import WindowEvaluator + + class ExponentialSmooth(WindowEvaluator): + def __init__(self, alpha: float) -> None: + self.alpha = alpha + + def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: + results = [] + curr_value = 0.0 + values = values[0] + for idx in range(num_rows): + if idx == 0: + curr_value = values[idx].as_py() + else: + curr_value = values[idx].as_py() * self.alpha + curr_value * ( + 1.0 - self.alpha + ) + results.append(curr_value) + + return pa.array(results) + + exp_smooth = udwf( + ExponentialSmooth(0.9), + pa.float64(), + pa.float64(), + volatility="immutable", + ) + + ctx = SessionContext() + + df = ctx.from_pydict({ + "a": [1.0, 2.1, 2.9, 4.0, 5.1, 6.0, 6.9, 8.0] + }) - df.aggregate([],[my_udaf(col("a"))]) + df.select("a", exp_smooth(col("a")).alias("smooth_a")).show() diff --git a/examples/python-udwf.py b/examples/python-udwf.py new file mode 100644 index
000000000..05b3021d8 --- /dev/null +++ b/examples/python-udwf.py @@ -0,0 +1,270 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pyarrow as pa +import datafusion +from datafusion import udwf, functions as f, col, lit +from datafusion.udf import WindowEvaluator +from datafusion.expr import WindowFrame + +# This example creates five different examples of user defined window functions in order +# to demonstrate the variety of ways a user may need to implement. 
+ + +class ExponentialSmoothDefault(WindowEvaluator): + """Create a running smooth operation across an entire partition at once.""" + + def __init__(self, alpha: float) -> None: + self.alpha = alpha + + def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: + results = [] + curr_value = 0.0 + values = values[0] + for idx in range(num_rows): + if idx == 0: + curr_value = values[idx].as_py() + else: + curr_value = values[idx].as_py() * self.alpha + curr_value * ( + 1.0 - self.alpha + ) + results.append(curr_value) + + return pa.array(results) + + +class SmoothBoundedFromPreviousRow(WindowEvaluator): + """Smooth over from the previous to current row only.""" + + def __init__(self, alpha: float) -> None: + self.alpha = alpha + + def supports_bounded_execution(self) -> bool: + return True + + def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: + # Override the default range of current row since uses_window_frame is False + # So for the purpose of this test we just smooth from the previous row to + # current. 
+ if idx == 0: + return (0, 0) + return (idx - 1, idx) + + def evaluate( + self, values: list[pa.Array], eval_range: tuple[int, int] + ) -> pa.Scalar: + (start, stop) = eval_range + curr_value = 0.0 + values = values[0] + for idx in range(start, stop + 1): + if idx == start: + curr_value = values[idx].as_py() + else: + curr_value = values[idx].as_py() * self.alpha + curr_value * ( + 1.0 - self.alpha + ) + return pa.scalar(curr_value).cast(pa.float64()) + + +class SmoothAcrossRank(WindowEvaluator): + """Smooth over the rank from the previous rank to current.""" + + def __init__(self, alpha: float) -> None: + self.alpha = alpha + + def include_rank(self) -> bool: + return True + + def evaluate_all_with_rank( + self, num_rows: int, ranks_in_partition: list[tuple[int, int]] + ) -> pa.Array: + results = [] + for idx in range(num_rows): + if idx == 0: + prior_value = 1.0 + matching_row = [ + i + for i in range(len(ranks_in_partition)) + if ranks_in_partition[i][0] <= idx and ranks_in_partition[i][1] > idx + ][0] + 1 + curr_value = matching_row * self.alpha + prior_value * (1.0 - self.alpha) + results.append(curr_value) + prior_value = matching_row + + return pa.array(results) + + +class ExponentialSmoothFrame(WindowEvaluator): + "Find the value across an entire frame using exponential smoothing" + + def __init__(self, alpha: float) -> None: + self.alpha = alpha + + def uses_window_frame(self) -> bool: + return True + + def evaluate( + self, values: list[pa.Array], eval_range: tuple[int, int] + ) -> pa.Scalar: + (start, stop) = eval_range + curr_value = 0.0 + if len(values) > 1: + order_by = values[1] # noqa: F841 + values = values[0] + else: + values = values[0] + for idx in range(start, stop): + if idx == start: + curr_value = values[idx].as_py() + else: + curr_value = values[idx].as_py() * self.alpha + curr_value * ( + 1.0 - self.alpha + ) + return pa.scalar(curr_value).cast(pa.float64()) + + +class SmoothTwoColumn(WindowEvaluator): + """Smooth one column based on a
condition of another column. + + If the second column is null, then smooth over the first column from + the previous and next rows. + """ + + def __init__(self, alpha: float) -> None: + self.alpha = alpha + + def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: + results = [] + values_a = values[0] + values_b = values[1] + for idx in range(num_rows): + if not values_b[idx].is_valid: + if idx == 0: + results.append(values_a[1].cast(pa.float64())) + elif idx == num_rows - 1: + results.append(values_a[num_rows - 2].cast(pa.float64())) + else: + results.append( + pa.scalar( + values_a[idx - 1].as_py() * self.alpha + + values_a[idx + 1].as_py() * (1.0 - self.alpha) + ) + ) + else: + results.append(values_a[idx].cast(pa.float64())) + + return pa.array(results) + + +# create a context +ctx = datafusion.SessionContext() + +# create a RecordBatch and a new DataFrame from it +batch = pa.RecordBatch.from_arrays( + [ + pa.array([1.0, 2.1, 2.9, 4.0, 5.1, 6.0, 6.9, 8.0]), + pa.array([1, 2, None, 4, 5, 6, None, 8]), + pa.array(["A", "A", "A", "A", "A", "B", "B", "B"]), + ], + names=["a", "b", "c"], +) +df = ctx.create_dataframe([[batch]]) + +exp_smooth = udwf( + ExponentialSmoothDefault(0.9), + pa.float64(), + pa.float64(), + volatility="immutable", +) + +smooth_two_row = udwf( + SmoothBoundedFromPreviousRow(0.9), + pa.float64(), + pa.float64(), + volatility="immutable", +) + +smooth_rank = udwf( + SmoothAcrossRank(0.9), + pa.float64(), + pa.float64(), + volatility="immutable", +) + +smooth_frame = udwf( + ExponentialSmoothFrame(0.9), + pa.float64(), + pa.float64(), + volatility="immutable", +) + +smooth_two_col = udwf( + SmoothTwoColumn(0.9), + [pa.float64(), pa.int64()], + pa.float64(), + volatility="immutable", +) + +# These are done with separate statements instead of one large `select` because that will +# attempt to combine the window operations and our defined UDFs do not all support that.
+( + df.with_column("exp_smooth", exp_smooth(col("a"))) + .with_column("smooth_prior_row", smooth_two_row(col("a"))) + .with_column("smooth_rank", smooth_rank(col("a")).order_by(col("c")).build()) + .with_column("smooth_two_col", smooth_two_col(col("a"), col("b"))) + .with_column( + "smooth_frame", + smooth_frame(col("a")).window_frame(WindowFrame("rows", None, 0)).build(), + ) + .select( + "a", + "b", + "c", + "exp_smooth", + "smooth_prior_row", + "smooth_rank", + "smooth_two_col", + "smooth_frame", + ) +).show() + +assert df.select(f.round(exp_smooth(col("a")), lit(3))).collect()[0].column( + 0 +) == pa.array([1, 1.99, 2.809, 3.881, 4.978, 5.898, 6.8, 7.88]) + + +assert df.select(f.round(smooth_two_row(col("a")), lit(3))).collect()[0].column( + 0 +) == pa.array([1.0, 1.99, 2.82, 3.89, 4.99, 5.91, 6.81, 7.89]) + + +assert df.select(smooth_rank(col("a")).order_by(col("c")).build()).collect()[0].column( + 0 +) == pa.array([1, 1, 1, 1, 1, 1.9, 2.0, 2.0]) + + +assert df.select(smooth_two_col(col("a"), col("b"))).collect()[0].column(0) == pa.array( + [1, 2.1, 2.29, 4, 5.1, 6, 6.2, 8.0] +) + + +assert df.select( + f.round( + smooth_frame(col("a")).window_frame(WindowFrame("rows", None, 0)).build(), + lit(3), + ) +).collect()[0].column(0) == pa.array([1, 1.99, 2.809, 3.881, 4.978, 5.898, 6.8, 7.88]) diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 08ca3fe02..4f40b2088 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -40,7 +40,7 @@ from .record_batch import RecordBatchStream, RecordBatch -from .udf import ScalarUDF, AggregateUDF, Accumulator +from .udf import ScalarUDF, AggregateUDF, Accumulator, WindowUDF from .common import ( DFSchema, @@ -78,6 +78,7 @@ "Database", "Table", "AggregateUDF", + "WindowUDF", "LogicalPlan", "ExecutionPlan", "RecordBatch", @@ -113,3 +114,5 @@ def lit(value): udf = ScalarUDF.udf udaf = AggregateUDF.udaf + +udwf = WindowUDF.udwf diff --git a/python/datafusion/context.py 
b/python/datafusion/context.py index 2c41faba6..5b52d397b 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -25,12 +25,11 @@ from ._internal import SessionContext as SessionContextInternal from ._internal import LogicalPlan, ExecutionPlan -from datafusion._internal import AggregateUDF from datafusion.catalog import Catalog, Table from datafusion.dataframe import DataFrame from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list from datafusion.record_batch import RecordBatchStream -from datafusion.udf import ScalarUDF +from datafusion.udf import ScalarUDF, AggregateUDF, WindowUDF from typing import Any, TYPE_CHECKING from typing_extensions import deprecated @@ -838,6 +837,10 @@ def register_udaf(self, udaf: AggregateUDF) -> None: """Register a user-defined aggregation function (UDAF) with the context.""" self.ctx.register_udaf(udaf._udaf) + def register_udwf(self, udwf: WindowUDF) -> None: + """Register a user-defined window function (UDWF) with the context.""" + self.ctx.register_udwf(udwf._udwf) + def catalog(self, name: str = "datafusion") -> Catalog: """Retrieve a catalog by name.""" return self.ctx.catalog(name) diff --git a/python/datafusion/tests/test_udwf.py b/python/datafusion/tests/test_udwf.py new file mode 100644 index 000000000..67c0979fe --- /dev/null +++ b/python/datafusion/tests/test_udwf.py @@ -0,0 +1,294 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pyarrow as pa +import pytest + +from datafusion import SessionContext, column, udwf, lit, functions as f +from datafusion.udf import WindowEvaluator +from datafusion.expr import WindowFrame + + +class ExponentialSmoothDefault(WindowEvaluator): + def __init__(self, alpha: float) -> None: + self.alpha = alpha + + def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: + results = [] + curr_value = 0.0 + values = values[0] + for idx in range(num_rows): + if idx == 0: + curr_value = values[idx].as_py() + else: + curr_value = values[idx].as_py() * self.alpha + curr_value * ( + 1.0 - self.alpha + ) + results.append(curr_value) + + return pa.array(results) + + +class ExponentialSmoothBounded(WindowEvaluator): + def __init__(self, alpha: float) -> None: + self.alpha = alpha + + def supports_bounded_execution(self) -> bool: + return True + + def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: + # Override the default range of current row since uses_window_frame is False + # So for the purpose of this test we just smooth from the previous row to + # current. 
+ if idx == 0: + return (0, 0) + return (idx - 1, idx) + + def evaluate( + self, values: list[pa.Array], eval_range: tuple[int, int] + ) -> pa.Scalar: + (start, stop) = eval_range + curr_value = 0.0 + values = values[0] + for idx in range(start, stop + 1): + if idx == start: + curr_value = values[idx].as_py() + else: + curr_value = values[idx].as_py() * self.alpha + curr_value * ( + 1.0 - self.alpha + ) + return pa.scalar(curr_value).cast(pa.float64()) + + +class ExponentialSmoothRank(WindowEvaluator): + def __init__(self, alpha: float) -> None: + self.alpha = alpha + + def include_rank(self) -> bool: + return True + + def evaluate_all_with_rank( + self, num_rows: int, ranks_in_partition: list[tuple[int, int]] + ) -> pa.Array: + results = [] + for idx in range(num_rows): + if idx == 0: + prior_value = 1.0 + matching_row = [ + i + for i in range(len(ranks_in_partition)) + if ranks_in_partition[i][0] <= idx and ranks_in_partition[i][1] > idx + ][0] + 1 + curr_value = matching_row * self.alpha + prior_value * (1.0 - self.alpha) + results.append(curr_value) + prior_value = matching_row + + return pa.array(results) + + +class ExponentialSmoothFrame(WindowEvaluator): + def __init__(self, alpha: float) -> None: + self.alpha = alpha + + def uses_window_frame(self) -> bool: + return True + + def evaluate( + self, values: list[pa.Array], eval_range: tuple[int, int] + ) -> pa.Scalar: + (start, stop) = eval_range + curr_value = 0.0 + if len(values) > 1: + order_by = values[1] # noqa: F841 + values = values[0] + else: + values = values[0] + for idx in range(start, stop): + if idx == start: + curr_value = values[idx].as_py() + else: + curr_value = values[idx].as_py() * self.alpha + curr_value * ( + 1.0 - self.alpha + ) + return pa.scalar(curr_value).cast(pa.float64()) + + +class SmoothTwoColumn(WindowEvaluator): + """This class demonstrates using two columns. + + If the second column is above a threshold, then smooth over the first column from + the previous and next rows. 
+ """ + + def __init__(self, alpha: float) -> None: + self.alpha = alpha + + def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: + results = [] + values_a = values[0] + values_b = values[1] + for idx in range(num_rows): + if values_b[idx].as_py() > 7: + if idx == 0: + results.append(values_a[1].cast(pa.float64())) + elif idx == num_rows - 1: + results.append(values_a[num_rows - 2].cast(pa.float64())) + else: + results.append( + pa.scalar( + values_a[idx - 1].as_py() * self.alpha + + values_a[idx + 1].as_py() * (1.0 - self.alpha) + ) + ) + else: + results.append(values_a[idx].cast(pa.float64())) + + return pa.array(results) + + +class NotSubclassOfWindowEvaluator: + pass + + +@pytest.fixture +def df(): + ctx = SessionContext() + + # create a RecordBatch and a new DataFrame from it + batch = pa.RecordBatch.from_arrays( + [ + pa.array([0, 1, 2, 3, 4, 5, 6]), + pa.array([7, 4, 3, 8, 9, 1, 6]), + pa.array(["A", "A", "A", "A", "B", "B", "B"]), + ], + names=["a", "b", "c"], + ) + return ctx.create_dataframe([[batch]]) + + +def test_udwf_errors(df): + with pytest.raises(TypeError): + udwf( + NotSubclassOfWindowEvaluator(), + pa.float64(), + pa.float64(), + volatility="immutable", + ) + + +smooth_default = udwf( + ExponentialSmoothDefault(0.9), + pa.float64(), + pa.float64(), + volatility="immutable", +) + +smooth_bounded = udwf( + ExponentialSmoothBounded(0.9), + pa.float64(), + pa.float64(), + volatility="immutable", +) + +smooth_rank = udwf( + ExponentialSmoothRank(0.9), + pa.utf8(), + pa.float64(), + volatility="immutable", +) + +smooth_frame = udwf( + ExponentialSmoothFrame(0.9), + pa.float64(), + pa.float64(), + volatility="immutable", +) + +smooth_two_col = udwf( + SmoothTwoColumn(0.9), + [pa.int64(), pa.int64()], + pa.float64(), + volatility="immutable", +) + +data_test_udwf_functions = [ + ( + "default_udwf", + smooth_default(column("a")), + [0, 0.9, 1.89, 2.889, 3.889, 4.889, 5.889], + ), + ( + "default_udwf_partitioned", + 
smooth_default(column("a")).partition_by(column("c")).build(), + [0, 0.9, 1.89, 2.889, 4.0, 4.9, 5.89], + ), + ( + "default_udwf_ordered", + smooth_default(column("a")).order_by(column("b")).build(), + [0.551, 1.13, 2.3, 2.755, 3.876, 5.0, 5.513], + ), + ( + "bounded_udwf", + smooth_bounded(column("a")), + [0, 0.9, 1.9, 2.9, 3.9, 4.9, 5.9], + ), + ( + "bounded_udwf_ignores_frame", + smooth_bounded(column("a")) + .window_frame(WindowFrame("rows", None, None)) + .build(), + [0, 0.9, 1.9, 2.9, 3.9, 4.9, 5.9], + ), + ( + "rank_udwf", + smooth_rank(column("c")).order_by(column("c")).build(), + [1, 1, 1, 1, 1.9, 2, 2], + ), + ( + "frame_unbounded_udwf", + smooth_frame(column("a")).window_frame(WindowFrame("rows", None, None)).build(), + [5.889, 5.889, 5.889, 5.889, 5.889, 5.889, 5.889], + ), + ( + "frame_bounded_udwf", + smooth_frame(column("a")).window_frame(WindowFrame("rows", None, 0)).build(), + [0.0, 0.9, 1.89, 2.889, 3.889, 4.889, 5.889], + ), + ( + "frame_bounded_udwf", + smooth_frame(column("a")) + .window_frame(WindowFrame("rows", None, 0)) + .order_by(column("b")) + .build(), + [0.551, 1.13, 2.3, 2.755, 3.876, 5.0, 5.513], + ), + ( + "two_column_udwf", + smooth_two_col(column("a"), column("b")), + [0.0, 1.0, 2.0, 2.2, 3.2, 5.0, 6.0], + ), +] + + +@pytest.mark.parametrize("name,expr,expected", data_test_udwf_functions) +def test_udwf_functions(df, name, expr, expected): + df = df.select("a", "b", f.round(expr, lit(3)).alias(name)) + + # execute and collect the first (and only) batch + result = df.sort(column("a")).select(column(name)).collect()[0] + + assert result.column(0) == pa.array(expected) diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index f74d675e3..bb7a90866 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-"""Provides the user defined functions for evaluation of dataframes.""" +"""Provides the user-defined functions for evaluation of dataframes.""" from __future__ import annotations @@ -76,7 +76,7 @@ def __str__(self): class ScalarUDF: - """Class for performing scalar user defined functions (UDF). + """Class for performing scalar user-defined functions (UDF). Scalar UDFs operate on a row by row basis. See also :py:class:`AggregateUDF` for operating on a group of rows. @@ -90,7 +90,7 @@ def __init__( return_type: _R, volatility: Volatility | str, ) -> None: - """Instantiate a scalar user defined function (UDF). + """Instantiate a scalar user-defined function (UDF). See helper method :py:func:`udf` for argument details. """ @@ -115,7 +115,7 @@ def udf( volatility: Volatility | str, name: str | None = None, ) -> ScalarUDF: - """Create a new User Defined Function. + """Create a new User-Defined Function. Args: func: A callable python function. @@ -127,7 +127,7 @@ def udf( name: A descriptive name for the function. Returns: - A user defined aggregate function, which can be used in either data + A user-defined scalar function, which can be used in either data aggregation or window function calls. """ if not callable(func): @@ -152,7 +152,7 @@ def state(self) -> List[pyarrow.Scalar]: pass @abstractmethod - def update(self, values: pyarrow.Array) -> None: + def update(self, *values: pyarrow.Array) -> None: """Evaluate an array of values and update state.""" pass @@ -172,7 +172,7 @@ def evaluate(self) -> pyarrow.Scalar: class AggregateUDF: - """Class for performing scalar user defined functions (UDF). + """Class for performing aggregate user-defined functions (UDAF). Aggregate UDFs operate on a group of rows and return a single value. See also :py:class:`ScalarUDF` for operating on a row by row basis. @@ -187,7 +187,7 @@ def __init__( state_type: list[pyarrow.DataType], volatility: Volatility | str, ) -> None: - """Instantiate a user defined aggregate function (UDAF).
+ """Instantiate a user-defined aggregate function (UDAF). See :py:func:`udaf` for a convenience function and argument descriptions. @@ -214,7 +214,7 @@ def udaf( volatility: Volatility | str, name: str | None = None, ) -> AggregateUDF: - """Create a new User Defined Aggregate Function. + """Create a new User-Defined Aggregate Function. The accumulator function must be callable and implement :py:class:`Accumulator`. @@ -227,7 +227,7 @@ def udaf( name: A descriptive name for the function. Returns: - A user defined aggregate function, which can be used in either data + A user-defined aggregate function, which can be used in either data aggregation or window function calls. """ if not issubclass(accum, Accumulator): @@ -246,3 +246,237 @@ def udaf( state_type=state_type, volatility=volatility, ) + + +class WindowEvaluator(metaclass=ABCMeta): + """Evaluator class for user-defined window functions (UDWF). + + It is up to the user to decide which evaluate function is appropriate. + + +------------------------+--------------------------------+------------------+---------------------------+ + | ``uses_window_frame`` | ``supports_bounded_execution`` | ``include_rank`` | function_to_implement | + +========================+================================+==================+===========================+ + | False (default) | False (default) | False (default) | ``evaluate_all`` | + +------------------------+--------------------------------+------------------+---------------------------+ + | False | True | False | ``evaluate`` | + +------------------------+--------------------------------+------------------+---------------------------+ + | False | True/False | True | ``evaluate_all_with_rank``| + +------------------------+--------------------------------+------------------+---------------------------+ + | True | True/False | True/False | ``evaluate`` | + +------------------------+--------------------------------+------------------+---------------------------+ + """ # noqa: W505 + 
+ + def memoize(self) -> None: + """Perform a memoize operation to improve performance. + + When the window frame has a fixed beginning (e.g. UNBOUNDED + PRECEDING), some functions such as FIRST_VALUE and + NTH_VALUE do not need the (unbounded) input once they have + seen a certain amount of input. + + ``memoize`` is called after each input batch is processed, and + such functions can save whatever they need. + """ + pass + + def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: + """Return the range for the window function. + + If the ``uses_window_frame`` flag is ``False``, this method is used to + calculate the required range for the window function during + stateful execution. + + Generally there is no required range, hence by default this + returns the smallest range (the current row), e.g. seeing the current row + is enough to calculate the window result (such as row_number, rank, + etc.). + + Args: + idx: Current index. + num_rows: Number of rows. + """ + return (idx, idx + 1) + + def is_causal(self) -> bool: + """Return ``False`` if the evaluator needs future data for its result.""" + return False + + def evaluate_all(self, values: list[pyarrow.Array], num_rows: int) -> pyarrow.Array: + """Evaluate a window function on an entire input partition. + + This function is called once per input *partition* for window functions that + *do not use* values from the window frame, such as + :py:func:`~datafusion.functions.row_number`, :py:func:`~datafusion.functions.rank`, + :py:func:`~datafusion.functions.dense_rank`, :py:func:`~datafusion.functions.percent_rank`, + :py:func:`~datafusion.functions.cume_dist`, :py:func:`~datafusion.functions.lead`, + and :py:func:`~datafusion.functions.lag`. + + It produces the result of all rows in a single pass. It + expects to receive the entire partition as the ``values`` and + must produce an output column with one output row for every + input row.
+ + ``num_rows`` is required to correctly compute the output in case + ``len(values) == 0`` + + Implementing this function is an optimization. Certain window + functions are not affected by the window frame definition or + the query doesn't have a frame, and ``evaluate`` skips the + (costly) window frame boundary calculation and the overhead of + calling ``evaluate`` for each output row. + + For example, the `LAG` built in window function does not use + the values of its window frame (it can be computed in one shot + on the entire partition with ``Self::evaluate_all`` regardless of the + window defined in the ``OVER`` clause) + + .. code-block:: text + + lag(x, 1) OVER (ORDER BY z ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING) + + However, ``avg()`` computes the average in the window and thus + does use its window frame. + + .. code-block:: text + + avg(x) OVER (PARTITION BY y ORDER BY z ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING) + """ # noqa: W505 + pass + + def evaluate( + self, values: list[pyarrow.Array], eval_range: tuple[int, int] + ) -> pyarrow.Scalar: + """Evaluate window function on a range of rows in an input partition. + + This is the simplest and most general function to implement + but also the least performant as it creates output one row at + a time. It is typically much faster to implement stateful + evaluation using one of the other specialized methods on this + trait. + + Returns a [`ScalarValue`] that is the value of the window + function within `range` for the entire partition. Argument + `values` contains the evaluation result of function arguments + and evaluation results of ORDER BY expressions. If function has a + single argument, `values[1..]` will contain ORDER BY expression results. + """ + pass + + def evaluate_all_with_rank( + self, num_rows: int, ranks_in_partition: list[tuple[int, int]] + ) -> pyarrow.Array: + """Called for window functions that only need the rank of a row. 
+ + Evaluate the partition evaluator against the partition using + the row ranks. For example, ``rank(col("a"))`` produces + + .. code-block:: text + + a | rank + - + ---- + A | 1 + A | 1 + C | 3 + D | 4 + D | 4 + + For this case, `num_rows` would be `5` and the + `ranks_in_partition` would be called with + + .. code-block:: text + + [ + (0,1), + (2,2), + (3,4), + ] + + The user must implement this method if ``include_rank`` returns True. + """ + pass + + def supports_bounded_execution(self) -> bool: + """Can the window function be incrementally computed using bounded memory?""" + return False + + def uses_window_frame(self) -> bool: + """Does the window function use the values from the window frame?""" + return False + + def include_rank(self) -> bool: + """Can this function be evaluated with (only) rank?""" + return False + + +if TYPE_CHECKING: + _W = TypeVar("_W", bound=WindowEvaluator) + + +class WindowUDF: + """Class for performing window user-defined functions (UDF). + + Window UDFs operate on a partition of rows. See + also :py:class:`ScalarUDF` for operating on a row by row basis. + """ + + def __init__( + self, + name: str | None, + func: WindowEvaluator, + input_types: list[pyarrow.DataType], + return_type: pyarrow.DataType, + volatility: Volatility | str, + ) -> None: + """Instantiate a user-defined window function (UDWF). + + See :py:func:`udwf` for a convenience function and argument + descriptions. + """ + self._udwf = df_internal.WindowUDF( + name, func, input_types, return_type, str(volatility) + ) + + def __call__(self, *args: Expr) -> Expr: + """Execute the UDWF. + + This function is not typically called by an end user. These calls will + occur during the evaluation of the dataframe. 
+ """ + args_raw = [arg.expr for arg in args] + return Expr(self._udwf.__call__(*args_raw)) + + @staticmethod + def udwf( + func: WindowEvaluator, + input_types: pyarrow.DataType | list[pyarrow.DataType], + return_type: pyarrow.DataType, + volatility: Volatility | str, + name: str | None = None, + ) -> WindowUDF: + """Create a new User-Defined Window Function. + + Args: + func: The python function. + input_types: The data types of the arguments to ``func``. + return_type: The data type of the return value. + volatility: See :py:class:`Volatility` for allowed values. + name: A descriptive name for the function. + + Returns: + A user-defined window function. + """ + if not isinstance(func, WindowEvaluator): + raise TypeError( + "`func` must implement the abstract base class WindowEvaluator" + ) + if name is None: + name = func.__class__.__qualname__.lower() + if isinstance(input_types, pyarrow.DataType): + input_types = [input_types] + return WindowUDF( + name=name, + func=func, + input_types=input_types, + return_type=return_type, + volatility=volatility, + ) diff --git a/src/context.rs b/src/context.rs index 7ad12ceb0..fde442ce4 100644 --- a/src/context.rs +++ b/src/context.rs @@ -42,6 +42,7 @@ use crate::sql::logical::PyLogicalPlan; use crate::store::StorageContexts; use crate::udaf::PyAggregateUDF; use crate::udf::PyScalarUDF; +use crate::udwf::PyWindowUDF; use crate::utils::{get_tokio_runtime, wait_for_future}; use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::arrow::pyarrow::PyArrowType; @@ -754,6 +755,11 @@ impl PySessionContext { Ok(()) } + pub fn register_udwf(&mut self, udwf: PyWindowUDF) -> PyResult<()> { + self.ctx.register_udwf(udwf.function); + Ok(()) + } + #[pyo3(signature = (name="datafusion"))] pub fn catalog(&self, name: &str) -> PyResult { match self.ctx.catalog(name) { diff --git a/src/lib.rs b/src/lib.rs index e4cc24078..98821833d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -58,6 +58,7 @@ pub mod substrait; mod 
udaf; #[allow(clippy::borrow_deref_ref)] mod udf; +mod udwf; pub mod utils; #[cfg(feature = "mimalloc")] @@ -90,6 +91,7 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/src/udwf.rs b/src/udwf.rs new file mode 100644 index 000000000..31cc5e60e --- /dev/null +++ b/src/udwf.rs @@ -0,0 +1,305 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::any::Any; +use std::ops::Range; +use std::sync::Arc; + +use arrow::array::{make_array, Array, ArrayData, ArrayRef}; +use datafusion::logical_expr::window_state::WindowAggState; +use datafusion::scalar::ScalarValue; +use pyo3::exceptions::PyValueError; +use pyo3::prelude::*; + +use datafusion::arrow::datatypes::DataType; +use datafusion::arrow::pyarrow::{FromPyArrow, PyArrowType, ToPyArrow}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_expr::{ + PartitionEvaluator, PartitionEvaluatorFactory, Signature, Volatility, WindowUDF, WindowUDFImpl, +}; +use pyo3::types::{PyList, PyTuple}; + +use crate::expr::PyExpr; +use crate::utils::parse_volatility; + +#[derive(Debug)] +struct RustPartitionEvaluator { + evaluator: PyObject, +} + +impl RustPartitionEvaluator { + fn new(evaluator: PyObject) -> Self { + Self { evaluator } + } +} + +impl PartitionEvaluator for RustPartitionEvaluator { + fn memoize(&mut self, _state: &mut WindowAggState) -> Result<()> { + Python::with_gil(|py| self.evaluator.bind(py).call_method0("memoize").map(|_| ())) + .map_err(|e| DataFusionError::Execution(format!("{e}"))) + } + + fn get_range(&self, idx: usize, n_rows: usize) -> Result> { + Python::with_gil(|py| { + let py_args = vec![idx.to_object(py), n_rows.to_object(py)]; + let py_args = PyTuple::new_bound(py, py_args); + + self.evaluator + .bind(py) + .call_method1("get_range", py_args) + .and_then(|v| { + let tuple: Bound<'_, PyTuple> = v.extract()?; + if tuple.len() != 2 { + return Err(PyValueError::new_err(format!( + "Expected get_range to return tuple of length 2. 
Received length {}", + tuple.len() + ))); + } + + let start: usize = tuple.get_item(0).unwrap().extract()?; + let end: usize = tuple.get_item(1).unwrap().extract()?; + + Ok(Range { start, end }) + }) + }) + .map_err(|e| DataFusionError::Execution(format!("{e}"))) + } + + fn is_causal(&self) -> bool { + Python::with_gil(|py| { + self.evaluator + .bind(py) + .call_method0("is_causal") + .and_then(|v| v.extract()) + .unwrap_or(false) + }) + } + + fn evaluate_all(&mut self, values: &[ArrayRef], num_rows: usize) -> Result { + Python::with_gil(|py| { + let py_values = PyList::new_bound( + py, + values + .iter() + .map(|arg| arg.into_data().to_pyarrow(py).unwrap()), + ); + let py_num_rows = num_rows.to_object(py).into_bound(py); + let py_args = PyTuple::new_bound( + py, + PyTuple::new_bound(py, vec![py_values.as_any(), &py_num_rows]), + ); + + self.evaluator + .bind(py) + .call_method1("evaluate_all", py_args) + .map(|v| { + let array_data = ArrayData::from_pyarrow_bound(&v).unwrap(); + make_array(array_data) + }) + .map_err(|e| DataFusionError::Execution(format!("{e}"))) + }) + } + + fn evaluate(&mut self, values: &[ArrayRef], range: &Range) -> Result { + Python::with_gil(|py| { + let py_values = PyList::new_bound( + py, + values + .iter() + .map(|arg| arg.into_data().to_pyarrow(py).unwrap()), + ); + let range_tuple = + PyTuple::new_bound(py, vec![range.start.to_object(py), range.end.to_object(py)]); + let py_args = PyTuple::new_bound( + py, + PyTuple::new_bound(py, vec![py_values.as_any(), range_tuple.as_any()]), + ); + + self.evaluator + .bind(py) + .call_method1("evaluate", py_args) + .and_then(|v| v.extract()) + .map_err(|e| DataFusionError::Execution(format!("{e}"))) + }) + } + + fn evaluate_all_with_rank( + &self, + num_rows: usize, + ranks_in_partition: &[Range], + ) -> Result { + Python::with_gil(|py| { + let ranks = ranks_in_partition + .iter() + .map(|r| PyTuple::new_bound(py, vec![r.start, r.end])); + + // 1. 
convert args to Python objects + let py_args = vec![num_rows.to_object(py), PyList::new_bound(py, ranks).into()]; + + let py_args = PyTuple::new_bound(py, py_args); + + // 2. call function + self.evaluator + .bind(py) + .call_method1("evaluate_all_with_rank", py_args) + .map_err(|e| DataFusionError::Execution(format!("{e}"))) + .map(|v| { + let array_data = ArrayData::from_pyarrow_bound(&v).unwrap(); + make_array(array_data) + }) + }) + } + + fn supports_bounded_execution(&self) -> bool { + Python::with_gil(|py| { + self.evaluator + .bind(py) + .call_method0("supports_bounded_execution") + .and_then(|v| v.extract()) + .unwrap_or(false) + }) + } + + fn uses_window_frame(&self) -> bool { + Python::with_gil(|py| { + self.evaluator + .bind(py) + .call_method0("uses_window_frame") + .and_then(|v| v.extract()) + .unwrap_or(false) + }) + } + + fn include_rank(&self) -> bool { + Python::with_gil(|py| { + self.evaluator + .bind(py) + .call_method0("include_rank") + .and_then(|v| v.extract()) + .unwrap_or(false) + }) + } +} + +pub fn to_rust_partition_evaluator(evaluator: PyObject) -> PartitionEvaluatorFactory { + Arc::new(move || -> Result> { + let evaluator = Python::with_gil(|py| evaluator.clone_ref(py)); + Ok(Box::new(RustPartitionEvaluator::new(evaluator))) + }) +} + +/// Represents a WindowUDF +#[pyclass(name = "WindowUDF", module = "datafusion", subclass)] +#[derive(Debug, Clone)] +pub struct PyWindowUDF { + pub(crate) function: WindowUDF, +} + +#[pymethods] +impl PyWindowUDF { + #[new] + #[pyo3(signature=(name, evaluator, input_types, return_type, volatility))] + fn new( + name: &str, + evaluator: PyObject, + input_types: Vec>, + return_type: PyArrowType, + volatility: &str, + ) -> PyResult { + let return_type = return_type.0; + let input_types = input_types.into_iter().map(|t| t.0).collect(); + + let function = WindowUDF::from(MultiColumnWindowUDF::new( + name, + input_types, + return_type, + parse_volatility(volatility)?, + to_rust_partition_evaluator(evaluator), +
)); + Ok(Self { function }) + } + + /// creates a new PyExpr with the call of the udf + #[pyo3(signature = (*args))] + fn __call__(&self, args: Vec) -> PyResult { + let args = args.iter().map(|e| e.expr.clone()).collect(); + Ok(self.function.call(args).into()) + } + + fn __repr__(&self) -> PyResult { + Ok(format!("WindowUDF({})", self.function.name())) + } +} + +pub struct MultiColumnWindowUDF { + name: String, + signature: Signature, + return_type: DataType, + partition_evaluator_factory: PartitionEvaluatorFactory, +} + +impl std::fmt::Debug for MultiColumnWindowUDF { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("WindowUDF") + .field("name", &self.name) + .field("signature", &self.signature) + .field("return_type", &"") + .field("partition_evaluator_factory", &"") + .finish() + } +} + +impl MultiColumnWindowUDF { + pub fn new( + name: impl Into, + input_types: Vec, + return_type: DataType, + volatility: Volatility, + partition_evaluator_factory: PartitionEvaluatorFactory, + ) -> Self { + let name = name.into(); + let signature = Signature::exact(input_types, volatility); + Self { + name, + signature, + return_type, + partition_evaluator_factory, + } + } +} + +impl WindowUDFImpl for MultiColumnWindowUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + &self.name + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(self.return_type.clone()) + } + + fn partition_evaluator(&self) -> Result> { + (self.partition_evaluator_factory)() + } +} From c49ca9ae299ec14048b5f44c81ff5ccd11ea11ba Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 30 Sep 2024 13:39:43 -0500 Subject: [PATCH 044/248] build(deps): bump syn from 2.0.77 to 2.0.79 (#886) Bumps [syn](https://github.com/dtolnay/syn) from 2.0.77 to 2.0.79. 
- [Release notes](https://github.com/dtolnay/syn/releases) - [Commits](https://github.com/dtolnay/syn/compare/2.0.77...2.0.79) --- updated-dependencies: - dependency-name: syn dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e48339e98..da5d19a93 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3077,9 +3077,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.77" +version = "2.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" +checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 11dccc4fb..54e53f4b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,7 +49,7 @@ futures = "0.3" object_store = { version = "0.11.0", features = ["aws", "gcp", "azure"] } parking_lot = "0.12" regex-syntax = "0.8" -syn = "2.0.68" +syn = "2.0.79" url = "2" [build-dependencies] From 97e330b9469f948da00e777d9e089e0181c1d5bf Mon Sep 17 00:00:00 2001 From: Sergey Fedoseev Date: Fri, 4 Oct 2024 14:37:07 +0200 Subject: [PATCH 045/248] fix example of reading parquet from s3 (#896) --- examples/sql-parquet-s3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sql-parquet-s3.py b/examples/sql-parquet-s3.py index bd7da5e20..61f1e0c50 100644 --- a/examples/sql-parquet-s3.py +++ b/examples/sql-parquet-s3.py @@ -31,7 +31,7 @@ ctx = datafusion.SessionContext() path = f"s3://{bucket_name}/" -ctx.register_object_store(path, s3) +ctx.register_object_store("s3://", s3, None) ctx.register_parquet("trips", path) From 
5b9e528b1120676eb224c0581ece928f58f60d14 Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Fri, 4 Oct 2024 09:36:57 -0500 Subject: [PATCH 046/248] release-testing (#889) * make script shebangs more flexible * move location of python tests This change allows us to run the test suite against a release candidate hosted on test.pypi.org. ``` pip install --extra-index-url https://test.pypi.org/simple/ datafusion==40.0.0 pytest --import-mode=importlib python/tests ``` * add documentation for testing against release candidate * add dev script for cleaning maturin build artifacts * update ruff lint config for new test location --- dev/clean.sh | 62 +++++++++++++++++++ dev/python_lint.sh | 2 +- dev/release/README.md | 23 ++++++- dev/release/verify-release-candidate.sh | 2 +- dev/rust_lint.sh | 2 +- docs/build.sh | 2 +- pyproject.toml | 2 +- python/{datafusion => }/tests/__init__.py | 0 python/{datafusion => }/tests/conftest.py | 0 .../tests/data_test_context/data.json | 0 python/{datafusion => }/tests/generic.py | 0 .../tests/test_aggregation.py | 0 python/{datafusion => }/tests/test_catalog.py | 0 python/{datafusion => }/tests/test_config.py | 0 python/{datafusion => }/tests/test_context.py | 0 .../{datafusion => }/tests/test_dataframe.py | 0 python/{datafusion => }/tests/test_expr.py | 0 .../{datafusion => }/tests/test_functions.py | 0 python/{datafusion => }/tests/test_imports.py | 0 .../{datafusion => }/tests/test_indexing.py | 0 python/{datafusion => }/tests/test_input.py | 0 python/{datafusion => }/tests/test_sql.py | 0 python/{datafusion => }/tests/test_store.py | 0 .../{datafusion => }/tests/test_substrait.py | 0 python/{datafusion => }/tests/test_udaf.py | 0 python/{datafusion => }/tests/test_udwf.py | 0 .../tests/test_wrapper_coverage.py | 0 27 files changed, 88 insertions(+), 7 deletions(-) create mode 100755 dev/clean.sh rename python/{datafusion => }/tests/__init__.py (100%) rename python/{datafusion => }/tests/conftest.py (100%) rename python/{datafusion => 
}/tests/data_test_context/data.json (100%) rename python/{datafusion => }/tests/generic.py (100%) rename python/{datafusion => }/tests/test_aggregation.py (100%) rename python/{datafusion => }/tests/test_catalog.py (100%) rename python/{datafusion => }/tests/test_config.py (100%) rename python/{datafusion => }/tests/test_context.py (100%) rename python/{datafusion => }/tests/test_dataframe.py (100%) rename python/{datafusion => }/tests/test_expr.py (100%) rename python/{datafusion => }/tests/test_functions.py (100%) rename python/{datafusion => }/tests/test_imports.py (100%) rename python/{datafusion => }/tests/test_indexing.py (100%) rename python/{datafusion => }/tests/test_input.py (100%) rename python/{datafusion => }/tests/test_sql.py (100%) rename python/{datafusion => }/tests/test_store.py (100%) rename python/{datafusion => }/tests/test_substrait.py (100%) rename python/{datafusion => }/tests/test_udaf.py (100%) rename python/{datafusion => }/tests/test_udwf.py (100%) rename python/{datafusion => }/tests/test_wrapper_coverage.py (100%) diff --git a/dev/clean.sh b/dev/clean.sh new file mode 100755 index 000000000..0d86680e8 --- /dev/null +++ b/dev/clean.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# This cleans up the project by removing build artifacts and other generated files. + +# Function to remove a directory and print the action +remove_dir() { + if [ -d "$1" ]; then + echo "Removing directory: $1" + rm -rf "$1" + fi +} + +# Function to remove a file and print the action +remove_file() { + if [ -f "$1" ]; then + echo "Removing file: $1" + rm -f "$1" + fi +} + +# Remove .pytest_cache directory +remove_dir .pytest_cache/ + +# Remove target directory +remove_dir target/ + +# Remove any __pycache__ directories +find python/ -type d -name "__pycache__" -print | while read -r dir; do + remove_dir "$dir" +done + +# Remove pytest-coverage.lcov file +# remove_file .coverage +# remove_file pytest-coverage.lcov + +# Remove rust-coverage.lcov file +# remove_file rust-coverage.lcov + +# Remove pyo3 files +find python/ -type f -name '_internal.*.so' -print | while read -r file; do + remove_file "$file" +done + +echo "Cleanup complete." \ No newline at end of file diff --git a/dev/python_lint.sh b/dev/python_lint.sh index 3bc67fb12..29f0d4833 100755 --- a/dev/python_lint.sh +++ b/dev/python_lint.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file diff --git a/dev/release/README.md b/dev/release/README.md index 93c2f97b9..49fd9de2d 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -159,10 +159,29 @@ Send the email to start the vote. 
## Verifying a Release -Install the release from testpypi: +Running the unit tests against a testpypi release candidate: ```bash -pip install --extra-index-url https://test.pypi.org/simple/ datafusion==0.7.0 +# clone a fresh repo +git clone https://github.com/apache/datafusion-python.git +cd datafusion-python + +# checkout the release commit +git fetch --tags +git checkout 40.0.0-rc1 + +# create the env +python3 -m venv venv +source venv/bin/activate + +# install release candidate +pip install --extra-index-url https://test.pypi.org/simple/ datafusion==40.0.0 + +# only dep needed to run tests is pytest +pip install pytest + +# run the tests +pytest --import-mode=importlib python/tests ``` Try running one of the examples from the top-level README, or write some custom Python code to query some available diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 14c0baee8..3879a267f 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file diff --git a/dev/rust_lint.sh b/dev/rust_lint.sh index b1285cbc3..eeb9e2302 100755 --- a/dev/rust_lint.sh +++ b/dev/rust_lint.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file diff --git a/docs/build.sh b/docs/build.sh index 7e8bb0b54..5afe85812 100755 --- a/docs/build.sh +++ b/docs/build.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file diff --git a/pyproject.toml b/pyproject.toml index 4e03ce8db..6e10333a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,7 @@ max-doc-length = 88 # Disable docstring checking for these directories [tool.ruff.lint.per-file-ignores] -"python/datafusion/tests/*" = ["D"] +"python/tests/*" = ["D"] "examples/*" = ["D", "W505"] "dev/*" = ["D"] "benchmarks/*" = ["D", "F"] diff --git a/python/datafusion/tests/__init__.py b/python/tests/__init__.py similarity index 100% rename from python/datafusion/tests/__init__.py rename to python/tests/__init__.py diff --git a/python/datafusion/tests/conftest.py b/python/tests/conftest.py similarity index 100% rename from python/datafusion/tests/conftest.py rename to python/tests/conftest.py diff --git a/python/datafusion/tests/data_test_context/data.json b/python/tests/data_test_context/data.json similarity index 100% rename from python/datafusion/tests/data_test_context/data.json rename to python/tests/data_test_context/data.json diff --git a/python/datafusion/tests/generic.py b/python/tests/generic.py similarity index 100% rename from python/datafusion/tests/generic.py rename to python/tests/generic.py diff --git a/python/datafusion/tests/test_aggregation.py b/python/tests/test_aggregation.py similarity index 100% rename from python/datafusion/tests/test_aggregation.py rename to python/tests/test_aggregation.py diff --git a/python/datafusion/tests/test_catalog.py b/python/tests/test_catalog.py similarity index 100% rename from python/datafusion/tests/test_catalog.py rename to python/tests/test_catalog.py diff --git a/python/datafusion/tests/test_config.py b/python/tests/test_config.py similarity index 100% rename from python/datafusion/tests/test_config.py rename to python/tests/test_config.py diff --git a/python/datafusion/tests/test_context.py b/python/tests/test_context.py similarity index 100% rename from python/datafusion/tests/test_context.py rename to python/tests/test_context.py diff --git 
a/python/datafusion/tests/test_dataframe.py b/python/tests/test_dataframe.py similarity index 100% rename from python/datafusion/tests/test_dataframe.py rename to python/tests/test_dataframe.py diff --git a/python/datafusion/tests/test_expr.py b/python/tests/test_expr.py similarity index 100% rename from python/datafusion/tests/test_expr.py rename to python/tests/test_expr.py diff --git a/python/datafusion/tests/test_functions.py b/python/tests/test_functions.py similarity index 100% rename from python/datafusion/tests/test_functions.py rename to python/tests/test_functions.py diff --git a/python/datafusion/tests/test_imports.py b/python/tests/test_imports.py similarity index 100% rename from python/datafusion/tests/test_imports.py rename to python/tests/test_imports.py diff --git a/python/datafusion/tests/test_indexing.py b/python/tests/test_indexing.py similarity index 100% rename from python/datafusion/tests/test_indexing.py rename to python/tests/test_indexing.py diff --git a/python/datafusion/tests/test_input.py b/python/tests/test_input.py similarity index 100% rename from python/datafusion/tests/test_input.py rename to python/tests/test_input.py diff --git a/python/datafusion/tests/test_sql.py b/python/tests/test_sql.py similarity index 100% rename from python/datafusion/tests/test_sql.py rename to python/tests/test_sql.py diff --git a/python/datafusion/tests/test_store.py b/python/tests/test_store.py similarity index 100% rename from python/datafusion/tests/test_store.py rename to python/tests/test_store.py diff --git a/python/datafusion/tests/test_substrait.py b/python/tests/test_substrait.py similarity index 100% rename from python/datafusion/tests/test_substrait.py rename to python/tests/test_substrait.py diff --git a/python/datafusion/tests/test_udaf.py b/python/tests/test_udaf.py similarity index 100% rename from python/datafusion/tests/test_udaf.py rename to python/tests/test_udaf.py diff --git a/python/datafusion/tests/test_udwf.py 
b/python/tests/test_udwf.py similarity index 100% rename from python/datafusion/tests/test_udwf.py rename to python/tests/test_udwf.py diff --git a/python/datafusion/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py similarity index 100% rename from python/datafusion/tests/test_wrapper_coverage.py rename to python/tests/test_wrapper_coverage.py From 976b70062cba0d81ca0dd71e0cd6fa7074be3de6 Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Fri, 4 Oct 2024 09:38:16 -0500 Subject: [PATCH 047/248] chore(bench): fix create_tables.sql for tpch benchmark (#897) The `WITH HEADER ROW` and `DELIMETER` clauses are no longer in use, and upstream datafusion errors out with a message saying so. --- benchmarks/tpch/create_tables.sql | 40 ++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/benchmarks/tpch/create_tables.sql b/benchmarks/tpch/create_tables.sql index 4b2209c4b..9f3aeea20 100644 --- a/benchmarks/tpch/create_tables.sql +++ b/benchmarks/tpch/create_tables.sql @@ -13,7 +13,10 @@ CREATE EXTERNAL TABLE customer ( c_extra VARCHAR NOT NULL, ) STORED AS CSV -WITH HEADER ROW DELIMITER '|' +OPTIONS ( + format.delimiter '|', + format.has_header true +) LOCATION '$PATH/customer.csv'; CREATE EXTERNAL TABLE lineitem ( @@ -36,7 +39,10 @@ CREATE EXTERNAL TABLE lineitem ( l_extra VARCHAR NOT NULL, ) STORED AS CSV -WITH HEADER ROW DELIMITER '|' +OPTIONS ( + format.delimiter '|', + format.has_header true +) LOCATION '$PATH/lineitem.csv'; CREATE EXTERNAL TABLE nation ( @@ -47,7 +53,10 @@ CREATE EXTERNAL TABLE nation ( n_extra VARCHAR NOT NULL, ) STORED AS CSV -WITH HEADER ROW DELIMITER '|' +OPTIONS ( + format.delimiter '|', + format.has_header true +) LOCATION '$PATH/nation.csv'; CREATE EXTERNAL TABLE orders ( @@ -63,7 +72,10 @@ CREATE EXTERNAL TABLE orders ( o_extra VARCHAR NOT NULL, ) STORED AS CSV -WITH HEADER ROW DELIMITER '|' +OPTIONS ( + format.delimiter '|', + format.has_header true +) LOCATION '$PATH/orders.csv'; 
CREATE EXTERNAL TABLE part ( @@ -79,7 +91,10 @@ CREATE EXTERNAL TABLE part ( p_extra VARCHAR NOT NULL, ) STORED AS CSV -WITH HEADER ROW DELIMITER '|' +OPTIONS ( + format.delimiter '|', + format.has_header true +) LOCATION '$PATH/part.csv'; CREATE EXTERNAL TABLE partsupp ( @@ -91,7 +106,10 @@ CREATE EXTERNAL TABLE partsupp ( ps_extra VARCHAR NOT NULL, ) STORED AS CSV -WITH HEADER ROW DELIMITER '|' +OPTIONS ( + format.delimiter '|', + format.has_header true +) LOCATION '$PATH/partsupp.csv'; CREATE EXTERNAL TABLE region ( @@ -101,7 +119,10 @@ CREATE EXTERNAL TABLE region ( r_extra VARCHAR NOT NULL, ) STORED AS CSV -WITH HEADER ROW DELIMITER '|' +OPTIONS ( + format.delimiter '|', + format.has_header true +) LOCATION '$PATH/region.csv'; CREATE EXTERNAL TABLE supplier ( @@ -115,5 +136,8 @@ CREATE EXTERNAL TABLE supplier ( s_extra VARCHAR NOT NULL, ) STORED AS CSV -WITH HEADER ROW DELIMITER '|' +OPTIONS ( + format.delimiter '|', + format.has_header true +) LOCATION '$PATH/supplier.csv'; \ No newline at end of file From d181a3039c3b5059b08016ea3f18a5e961611aef Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 4 Oct 2024 11:29:54 -0400 Subject: [PATCH 048/248] Add physical and logical plan conversion to and from protobuf (#892) * Add physical and logical plan conversion to and from protobuf * Add wrappers for LogicalPlan and ExecutionPlan * Add unit tests for to_proto and from_proto for logical and physical plans --- Cargo.lock | 251 ++++++++++++-------------- Cargo.toml | 1 + python/datafusion/__init__.py | 4 +- python/datafusion/context.py | 12 +- python/datafusion/dataframe.py | 11 +- python/datafusion/expr.py | 8 +- python/datafusion/plan.py | 147 +++++++++++++++ python/datafusion/substrait.py | 10 +- python/datafusion/tests/test_plans.py | 42 +++++ src/physical_plan.rs | 35 +++- src/sql/logical.rs | 35 +++- 11 files changed, 398 insertions(+), 158 deletions(-) create mode 100644 python/datafusion/plan.py create mode 100644 python/datafusion/tests/test_plans.py diff 
--git a/Cargo.lock b/Cargo.lock index da5d19a93..0935ea9e0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -178,7 +178,7 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown", + "hashbrown 0.14.5", "num", ] @@ -351,9 +351,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fec134f64e2bc57411226dfc4e52dec859ddfc7e711fc5e07b612584f000e4aa" +checksum = "7e614738943d3f68c628ae3dbce7c3daffb196665f82f8c8ea6b65de73c79429" dependencies = [ "bzip2", "flate2", @@ -380,9 +380,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.82" +version = "0.1.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a27b8a3a6e1a44fa4c8baf1f653e4172e81486d4941f2237e20dc2d0cf4ddff1" +checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", @@ -406,9 +406,9 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] name = "autocfg" -version = "1.3.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "backtrace" @@ -515,9 +515,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.7.1" +version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8318a53db07bb3f8dca91a600466bdb3f2eaadeedfdbcf02e1accbad9271ba50" +checksum = "428d9aa8fbc0670b7b8d6030a7fadd0f86151cae55e4dbbece15f3780a3dfaf3" [[package]] name = "bzip2" @@ -542,9 +542,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.20" +version = "1.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"45bcde016d64c21da4be18b655631e5ab6d3107607e71a73a9f53eb48aae23fb" +checksum = "812acba72f0a070b003d3697490d2b55b837230ae7c6c6497f05cc2ddbb8d938" dependencies = [ "jobserver", "libc", @@ -738,7 +738,7 @@ checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" dependencies = [ "cfg-if", "crossbeam-utils", - "hashbrown", + "hashbrown 0.14.5", "lock_api", "once_cell", "parking_lot_core", @@ -781,7 +781,7 @@ dependencies = [ "futures", "glob", "half", - "hashbrown", + "hashbrown 0.14.5", "indexmap", "itertools", "log", @@ -832,7 +832,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "hashbrown", + "hashbrown 0.14.5", "instant", "libc", "num_cpus", @@ -866,7 +866,7 @@ dependencies = [ "datafusion-common", "datafusion-expr", "futures", - "hashbrown", + "hashbrown 0.14.5", "log", "object_store", "parking_lot", @@ -923,7 +923,7 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr", - "hashbrown", + "hashbrown 0.14.5", "hex", "itertools", "log", @@ -1017,7 +1017,7 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "hashbrown", + "hashbrown 0.14.5", "indexmap", "itertools", "log", @@ -1047,7 +1047,7 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-physical-expr-common", "half", - "hashbrown", + "hashbrown 0.14.5", "hex", "indexmap", "itertools", @@ -1067,7 +1067,7 @@ dependencies = [ "arrow", "datafusion-common", "datafusion-expr-common", - "hashbrown", + "hashbrown 0.14.5", "rand", ] @@ -1109,7 +1109,7 @@ dependencies = [ "datafusion-physical-expr-common", "futures", "half", - "hashbrown", + "hashbrown 0.14.5", "indexmap", "itertools", "log", @@ -1120,6 +1120,35 @@ dependencies = [ "tokio", ] +[[package]] +name = "datafusion-proto" +version = "42.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "585357d621fa03ea85a7fefca79ebc5ef0ee13a7f82be0762a414879a4d190a7" +dependencies = [ + "arrow", + "chrono", + "datafusion", + 
"datafusion-common", + "datafusion-expr", + "datafusion-proto-common", + "object_store", + "prost", +] + +[[package]] +name = "datafusion-proto-common" +version = "42.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4db6534382f92f528bdb5d925b4214c31ffd84fa7fe1eff3ed0d2f1286851ab8" +dependencies = [ + "arrow", + "chrono", + "datafusion-common", + "object_store", + "prost", +] + [[package]] name = "datafusion-python" version = "41.0.0" @@ -1127,6 +1156,7 @@ dependencies = [ "arrow", "async-trait", "datafusion", + "datafusion-proto", "datafusion-substrait", "futures", "mimalloc", @@ -1242,9 +1272,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.33" +version = "1.0.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "324a1be68054ef05ad64b861cc9eaf1d623d2d8cb25b4bf2cb9cdd902b4bf253" +checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" dependencies = [ "crc32fast", "miniz_oxide", @@ -1427,6 +1457,12 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "hashbrown" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" + [[package]] name = "heck" version = "0.4.1" @@ -1487,9 +1523,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.9.4" +version = "1.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" +checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" [[package]] name = "humantime" @@ -1528,7 +1564,7 @@ dependencies = [ "hyper", "hyper-util", "rustls", - "rustls-native-certs 0.8.0", + "rustls-native-certs", "rustls-pki-types", "tokio", "tokio-rustls", @@ -1537,9 +1573,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.8" +version = "0.1.9" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "da62f120a8a37763efb0cf8fdf264b884c7b8b9ac8660b900c8661030c00e6ba" +checksum = "41296eb09f183ac68eec06e03cdbea2e759633d4067b2f6552fc2e009bcad08b" dependencies = [ "bytes", "futures-channel", @@ -1550,7 +1586,6 @@ dependencies = [ "pin-project-lite", "socket2", "tokio", - "tower", "tower-service", "tracing", ] @@ -1590,12 +1625,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" +checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.15.0", ] [[package]] @@ -1733,9 +1768,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.158" +version = "0.2.159" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" +checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" [[package]] name = "libflate" @@ -1757,7 +1792,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" dependencies = [ "core2", - "hashbrown", + "hashbrown 0.14.5", "rle-decode-fast", ] @@ -2012,9 +2047,12 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.19.0" +version = "1.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +checksum = "82881c4be219ab5faaf2ad5e5e5ecdff8c66bd7402ca3160975c93b24961afd1" +dependencies = [ + "portable-atomic", +] [[package]] name = "openssl-probe" @@ -2075,7 +2113,7 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown", + "hashbrown 0.14.5", "lz4_flex", "num", "num-bigint", @@ -2196,26 +2234,6 @@ dependencies = [ 
"siphasher", ] -[[package]] -name = "pin-project" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "pin-project-lite" version = "0.2.14" @@ -2230,15 +2248,15 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" +checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" [[package]] name = "portable-atomic" -version = "1.7.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265" +checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" [[package]] name = "ppv-lite86" @@ -2280,9 +2298,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8650aabb6c35b860610e9cff5dc1af886c9e25073b7b1712a68972af4281302" +checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" dependencies = [ "bytes", "heck 0.5.0", @@ -2401,9 +2419,9 @@ checksum = "b76f1009795ca44bb5aaae8fd3f18953e209259c33d9b059b1f53d58ab7511db" [[package]] name = "quick-xml" -version = "0.36.1" +version = "0.36.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"96a05e2e8efddfa51a84ca47cec303fac86c8541b686d37cac5efc0e094417bc" +checksum = "f7649a7b4df05aed9ea7ec6f628c67c9953a43869b8bc50929569b2999d443fe" dependencies = [ "memchr", "serde", @@ -2498,18 +2516,18 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.4" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0884ad60e090bf1345b93da0a5de8923c93884cd03f40dfcfddd3b4bee661853" +checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" dependencies = [ "bitflags 2.6.0", ] [[package]] name = "regex" -version = "1.10.6" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" dependencies = [ "aho-corasick", "memchr", @@ -2519,9 +2537,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" dependencies = [ "aho-corasick", "memchr", @@ -2536,9 +2554,9 @@ checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" [[package]] name = "regex-syntax" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "regress" @@ -2546,15 +2564,15 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eae2a1ebfecc58aff952ef8ccd364329abe627762f5bf09ff42eb9d98522479" dependencies = [ - "hashbrown", + "hashbrown 0.14.5", "memchr", ] [[package]] name = "reqwest" -version = "0.12.7" 
+version = "0.12.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8f4955649ef5c38cc7f9e8aa41761d48fb9677197daea9984dc54f56aad5e63" +checksum = "f713147fbe92361e52392c73b8c9e48c04c6625bce969ef54dc901e58e042a7b" dependencies = [ "base64 0.22.1", "bytes", @@ -2576,7 +2594,7 @@ dependencies = [ "pin-project-lite", "quinn", "rustls", - "rustls-native-certs 0.7.3", + "rustls-native-certs", "rustls-pemfile", "rustls-pki-types", "serde", @@ -2664,19 +2682,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "rustls-native-certs" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5" -dependencies = [ - "openssl-probe", - "rustls-pemfile", - "rustls-pki-types", - "schannel", - "security-framework", -] - [[package]] name = "rustls-native-certs" version = "0.8.0" @@ -2692,19 +2697,18 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "2.1.3" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "196fe16b00e106300d3e45ecfcb764fa292a535d7326a29a5875c579c7417425" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" dependencies = [ - "base64 0.22.1", "rustls-pki-types", ] [[package]] name = "rustls-pki-types" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0" +checksum = "0e696e35370c65c9c541198af4543ccd580cf17fc25d8e05c5a242b202488c55" [[package]] name = "rustls-webpki" @@ -2792,9 +2796,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.11.1" +version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75da29fe9b9b08fe9d6b22b5b4bcbc75d8db3aa31e639aa56bb62e9d46bfceaf" +checksum = "ea4a292869320c0272d7bc55a5a6aafaff59b4f63404a003887b679a2e05b4b6" dependencies = 
[ "core-foundation-sys", "libc", @@ -2935,18 +2939,18 @@ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "snafu" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b835cb902660db3415a672d862905e791e54d306c6e8189168c7f3d9ae1c79d" +checksum = "223891c85e2a29c3fe8fb900c1fae5e69c2e42415e3177752e8718475efa5019" dependencies = [ "snafu-derive", ] [[package]] name = "snafu-derive" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d1e02fca405f6280643174a50c942219f0bbf4dbf7d480f1dd864d6f211ae5" +checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -3103,9 +3107,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.12.0" +version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" +checksum = "f0f2c9fc62d0beef6951ccffd757e241266a2c833136efbe35af6cd2567dca5b" dependencies = [ "cfg-if", "fastrand", @@ -3116,18 +3120,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.63" +version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" +checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.63" +version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" +checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" dependencies = [ "proc-macro2", "quote", @@ -3220,27 +3224,6 @@ dependencies = [ "tokio", ] 
-[[package]] -name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "pin-project", - "pin-project-lite", - "tokio", - "tower-layer", - "tower-service", -] - -[[package]] -name = "tower-layer" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" - [[package]] name = "tower-service" version = "0.3.3" @@ -3369,9 +3352,9 @@ dependencies = [ [[package]] name = "unicode-bidi" -version = "0.3.15" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" +checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" [[package]] name = "unicode-ident" @@ -3396,9 +3379,9 @@ checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" -version = "0.1.13" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" [[package]] name = "unindent" @@ -3539,9 +3522,9 @@ checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" [[package]] name = "wasm-streams" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +checksum = "4e072d4e72f700fb3443d8fe94a39315df013eef1104903cdb0a2abd322bbecd" dependencies = [ "futures-util", "js-sys", diff --git a/Cargo.toml b/Cargo.toml index 54e53f4b0..a0723984f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,7 @@ pyo3 = { version = "0.22", features = 
["extension-module", "abi3", "abi3-py38"] arrow = { version = "53", features = ["pyarrow"] } datafusion = { version = "42.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } datafusion-substrait = { version = "42.0.0", optional = true } +datafusion-proto = { version = "42.0.0" } prost = "0.13" # keep in line with `datafusion-substrait` prost-types = "0.13" # keep in line with `datafusion-substrait` uuid = { version = "1.9", features = ["v4"] } diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 4f40b2088..63c19b3e1 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -36,7 +36,7 @@ from .catalog import Catalog, Database, Table # The following imports are okay to remain as opaque to the user. -from ._internal import Config, LogicalPlan, ExecutionPlan, runtime +from ._internal import Config, runtime from .record_batch import RecordBatchStream, RecordBatch @@ -53,6 +53,8 @@ WindowFrame, ) +from .plan import LogicalPlan, ExecutionPlan + from . import functions, object_store, substrait __version__ = importlib_metadata.version(__name__) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 5b52d397b..b08e62d77 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -23,7 +23,6 @@ from ._internal import RuntimeConfig as RuntimeConfigInternal from ._internal import SQLOptions as SQLOptionsInternal from ._internal import SessionContext as SessionContextInternal -from ._internal import LogicalPlan, ExecutionPlan from datafusion.catalog import Catalog, Table from datafusion.dataframe import DataFrame @@ -39,6 +38,7 @@ import pandas import polars import pathlib + from datafusion.plan import LogicalPlan, ExecutionPlan class SessionConfig: @@ -268,8 +268,10 @@ def with_disk_manager_specified(self, *paths: str | pathlib.Path) -> RuntimeConf Returns: A new :py:class:`RuntimeConfig` object with the updated setting. 
""" - paths = [str(p) for p in paths] - self.config_internal = self.config_internal.with_disk_manager_specified(paths) + paths_list = [str(p) for p in paths] + self.config_internal = self.config_internal.with_disk_manager_specified( + paths_list + ) return self def with_unbounded_memory_pool(self) -> RuntimeConfig: @@ -558,7 +560,7 @@ def create_dataframe_from_logical_plan(self, plan: LogicalPlan) -> DataFrame: Returns: DataFrame representation of the logical plan. """ - return DataFrame(self.ctx.create_dataframe_from_logical_plan(plan)) + return DataFrame(self.ctx.create_dataframe_from_logical_plan(plan._raw_plan)) def from_pylist( self, data: list[dict[str, Any]], name: str | None = None @@ -1034,4 +1036,4 @@ def read_table(self, table: Table) -> DataFrame: def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream: """Execute the ``plan`` and return the results.""" - return RecordBatchStream(self.ctx.execute(plan, partitions)) + return RecordBatchStream(self.ctx.execute(plan._raw_plan, partitions)) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 2328ef8fa..c5ac0bb89 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -24,6 +24,7 @@ from typing import Any, List, TYPE_CHECKING from datafusion.record_batch import RecordBatchStream from typing_extensions import deprecated +from datafusion.plan import LogicalPlan, ExecutionPlan if TYPE_CHECKING: import pyarrow as pa @@ -34,10 +35,6 @@ from datafusion._internal import DataFrame as DataFrameInternal from datafusion.expr import Expr, SortExpr, sort_or_default -from datafusion._internal import ( - LogicalPlan, - ExecutionPlan, -) class DataFrame: @@ -316,7 +313,7 @@ def logical_plan(self) -> LogicalPlan: Returns: Unoptimized logical plan. """ - return self.df.logical_plan() + return LogicalPlan(self.df.logical_plan()) def optimized_logical_plan(self) -> LogicalPlan: """Return the optimized ``LogicalPlan``. 
@@ -324,7 +321,7 @@ def optimized_logical_plan(self) -> LogicalPlan: Returns: Optimized logical plan. """ - return self.df.optimized_logical_plan() + return LogicalPlan(self.df.optimized_logical_plan()) def execution_plan(self) -> ExecutionPlan: """Return the execution/physical plan. @@ -332,7 +329,7 @@ def execution_plan(self) -> ExecutionPlan: Returns: Execution plan. """ - return self.df.execution_plan() + return ExecutionPlan(self.df.execution_plan()) def repartition(self, num: int) -> DataFrame: """Repartition a DataFrame into ``num`` partitions. diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 152aa38d3..8600627ae 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -22,16 +22,18 @@ from __future__ import annotations -from typing import Any, Optional, Type +from typing import Any, Optional, Type, TYPE_CHECKING import pyarrow as pa from datafusion.common import DataTypeMap, NullTreatment, RexType from typing_extensions import deprecated -from ._internal import LogicalPlan from ._internal import expr as expr_internal from ._internal import functions as functions_internal +if TYPE_CHECKING: + from datafusion.plan import LogicalPlan + # The following are imported from the internal representation. We may choose to # give these all proper wrappers, or to simply leave as is. These were added # in order to support passing the `test_imports` unit test. @@ -485,7 +487,7 @@ def rex_call_operator(self) -> str: def column_name(self, plan: LogicalPlan) -> str: """Compute the output column name based on the provided logical plan.""" - return self.expr.column_name(plan) + return self.expr.column_name(plan._raw_plan) def order_by(self, *exprs: Expr | SortExpr) -> ExprFuncBuilder: """Set the ordering for a window or aggregate function. 
diff --git a/python/datafusion/plan.py b/python/datafusion/plan.py new file mode 100644 index 000000000..3836edec6 --- /dev/null +++ b/python/datafusion/plan.py @@ -0,0 +1,147 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""This module supports physical and logical plans in DataFusion.""" + +from __future__ import annotations + +import datafusion._internal as df_internal + +from typing import List, Any, TYPE_CHECKING + +if TYPE_CHECKING: + from datafusion.context import SessionContext + +__all__ = [ + "LogicalPlan", + "ExecutionPlan", +] + + +class LogicalPlan: + """Logical Plan. + + A `LogicalPlan` is a node in a tree of relational operators (such as + Projection or Filter). + + Represents transforming an input relation (table) to an output relation + (table) with a potentially different schema. Plans form a dataflow tree + where data flows from leaves up to the root to produce the query result. + + `LogicalPlan`s can be created by the SQL query planner, the DataFrame API, + or programmatically (for example custom query languages). 
+ """ + + def __init__(self, plan: df_internal.LogicalPlan) -> None: + """This constructor should not be called by the end user.""" + self._raw_plan = plan + + def to_variant(self) -> Any: + """Convert the logical plan into its specific variant.""" + return self._raw_plan.to_variant() + + def inputs(self) -> List[LogicalPlan]: + """Returns the list of inputs to the logical plan.""" + return [LogicalPlan(p) for p in self._raw_plan.inputs()] + + def __repr__(self) -> str: + """Generate a printable representation of the plan.""" + return self._raw_plan.__repr__() + + def display(self) -> str: + """Print the logical plan.""" + return self._raw_plan.display() + + def display_indent(self) -> str: + """Print an indented form of the logical plan.""" + return self._raw_plan.display_indent() + + def display_indent_schema(self) -> str: + """Print an indented form of the schema for the logical plan.""" + return self._raw_plan.display_indent_schema() + + def display_graphviz(self) -> str: + """Print the graph visualization of the logical plan. + + Returns a `format`able structure that produces lines meant for graphical display + using the `DOT` language. This format can be visualized using software from + [`graphviz`](https://graphviz.org/) + """ + return self._raw_plan.display_graphviz() + + @staticmethod + def from_proto(ctx: SessionContext, data: bytes) -> LogicalPlan: + """Create a LogicalPlan from protobuf bytes. + + Tables created in memory from record batches are currently not supported. + """ + return LogicalPlan(df_internal.LogicalPlan.from_proto(ctx.ctx, data)) + + def to_proto(self) -> bytes: + """Convert a LogicalPlan to protobuf bytes. + + Tables created in memory from record batches are currently not supported. 
+ """ + return self._raw_plan.to_proto() + + +class ExecutionPlan: + """Represent nodes in the DataFusion Physical Plan.""" + + def __init__(self, plan: df_internal.ExecutionPlan) -> None: + """This constructor should not be called by the end user.""" + self._raw_plan = plan + + def children(self) -> List[ExecutionPlan]: + """Get a list of children `ExecutionPlan`s that act as inputs to this plan. + + The returned list will be empty for leaf nodes such as scans, will contain a + single value for unary nodes, or two values for binary nodes (such as joins). + """ + return [ExecutionPlan(e) for e in self._raw_plan.children()] + + def display(self) -> str: + """Print the physical plan.""" + return self._raw_plan.display() + + def display_indent(self) -> str: + """Print an indented form of the physical plan.""" + return self._raw_plan.display_indent() + + def __repr__(self) -> str: + """Print a string representation of the physical plan.""" + return self._raw_plan.__repr__() + + @property + def partition_count(self) -> int: + """Returns the number of partitions in the physical plan.""" + return self._raw_plan.partition_count + + @staticmethod + def from_proto(ctx: SessionContext, data: bytes) -> ExecutionPlan: + """Create an ExecutionPlan from protobuf bytes. + + Tables created in memory from record batches are currently not supported. + """ + return ExecutionPlan(df_internal.ExecutionPlan.from_proto(ctx.ctx, data)) + + def to_proto(self) -> bytes: + """Convert an ExecutionPlan into protobuf bytes. + + Tables created in memory from record batches are currently not supported. 
+ """ + return self._raw_plan.to_proto() diff --git a/python/datafusion/substrait.py b/python/datafusion/substrait.py index 0cdd19a51..dea47acca 100644 --- a/python/datafusion/substrait.py +++ b/python/datafusion/substrait.py @@ -28,10 +28,10 @@ from typing import TYPE_CHECKING from typing_extensions import deprecated import pathlib +from datafusion.plan import LogicalPlan if TYPE_CHECKING: from datafusion.context import SessionContext - from datafusion._internal import LogicalPlan __all__ = [ "Plan", @@ -156,7 +156,9 @@ def to_substrait_plan(logical_plan: LogicalPlan, ctx: SessionContext) -> Plan: Substrait plan. """ return Plan( - substrait_internal.Producer.to_substrait_plan(logical_plan, ctx.ctx) + substrait_internal.Producer.to_substrait_plan( + logical_plan._raw_plan, ctx.ctx + ) ) @@ -181,8 +183,8 @@ def from_substrait_plan(ctx: SessionContext, plan: Plan) -> LogicalPlan: Returns: LogicalPlan. """ - return substrait_internal.Consumer.from_substrait_plan( - ctx.ctx, plan.plan_internal + return LogicalPlan( + substrait_internal.Consumer.from_substrait_plan(ctx.ctx, plan.plan_internal) ) diff --git a/python/datafusion/tests/test_plans.py b/python/datafusion/tests/test_plans.py new file mode 100644 index 000000000..0283a4e6a --- /dev/null +++ b/python/datafusion/tests/test_plans.py @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from datafusion import SessionContext, LogicalPlan, ExecutionPlan +import pytest + + +# Note: We must use CSV because memory tables are currently not supported for +# conversion to/from protobuf. +@pytest.fixture +def df(): + ctx = SessionContext() + return ctx.read_csv(path="testing/data/csv/aggregate_test_100.csv").select("c1") + + +def test_logical_plan_to_proto(ctx, df) -> None: + logical_plan_bytes = df.logical_plan().to_proto() + logical_plan = LogicalPlan.from_proto(ctx, logical_plan_bytes) + + df_round_trip = ctx.create_dataframe_from_logical_plan(logical_plan) + + assert df.collect() == df_round_trip.collect() + + original_execution_plan = df.execution_plan() + execution_plan_bytes = original_execution_plan.to_proto() + execution_plan = ExecutionPlan.from_proto(ctx, execution_plan_bytes) + + assert str(original_execution_plan) == str(execution_plan) diff --git a/src/physical_plan.rs b/src/physical_plan.rs index c97c1a96e..9ef2f0ebb 100644 --- a/src/physical_plan.rs +++ b/src/physical_plan.rs @@ -16,9 +16,13 @@ // under the License. 
use datafusion::physical_plan::{displayable, ExecutionPlan, ExecutionPlanProperties}; +use datafusion_proto::physical_plan::{AsExecutionPlan, DefaultPhysicalExtensionCodec}; +use prost::Message; use std::sync::Arc; -use pyo3::prelude::*; +use pyo3::{exceptions::PyRuntimeError, prelude::*, types::PyBytes}; + +use crate::{context::PySessionContext, errors::DataFusionError}; #[pyclass(name = "ExecutionPlan", module = "datafusion", subclass)] #[derive(Debug, Clone)] @@ -54,6 +58,35 @@ impl PyExecutionPlan { format!("{}", d.indent(false)) } + pub fn to_proto<'py>(&'py self, py: Python<'py>) -> PyResult> { + let codec = DefaultPhysicalExtensionCodec {}; + let proto = datafusion_proto::protobuf::PhysicalPlanNode::try_from_physical_plan( + self.plan.clone(), + &codec, + )?; + + let bytes = proto.encode_to_vec(); + Ok(PyBytes::new_bound(py, &bytes)) + } + + #[staticmethod] + pub fn from_proto(ctx: PySessionContext, proto_msg: Bound<'_, PyBytes>) -> PyResult { + let bytes: &[u8] = proto_msg.extract()?; + let proto_plan = + datafusion_proto::protobuf::PhysicalPlanNode::decode(bytes).map_err(|e| { + PyRuntimeError::new_err(format!( + "Unable to decode logical node from serialized bytes: {}", + e + )) + })?; + + let codec = DefaultPhysicalExtensionCodec {}; + let plan = proto_plan + .try_into_physical_plan(&ctx.ctx, &ctx.ctx.runtime_env(), &codec) + .map_err(DataFusionError::from)?; + Ok(Self::new(plan)) + } + fn __repr__(&self) -> String { self.display_indent() } diff --git a/src/sql/logical.rs b/src/sql/logical.rs index d00f0af3f..fc398ff89 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -17,7 +17,6 @@ use std::sync::Arc; -use crate::errors::py_unsupported_variant_err; use crate::expr::aggregate::PyAggregate; use crate::expr::analyze::PyAnalyze; use crate::expr::cross_join::PyCrossJoin; @@ -35,8 +34,11 @@ use crate::expr::subquery_alias::PySubqueryAlias; use crate::expr::table_scan::PyTableScan; use crate::expr::unnest::PyUnnest; use 
crate::expr::window::PyWindowExpr; -use datafusion::logical_expr::LogicalPlan; -use pyo3::prelude::*; +use crate::{context::PySessionContext, errors::py_unsupported_variant_err}; +use datafusion::{error::DataFusionError, logical_expr::LogicalPlan}; +use datafusion_proto::logical_plan::{AsLogicalPlan, DefaultLogicalExtensionCodec}; +use prost::Message; +use pyo3::{exceptions::PyRuntimeError, prelude::*, types::PyBytes}; use crate::expr::logical_node::LogicalNode; @@ -125,6 +127,33 @@ impl PyLogicalPlan { fn display_graphviz(&self) -> String { format!("{}", self.plan.display_graphviz()) } + + pub fn to_proto<'py>(&'py self, py: Python<'py>) -> PyResult> { + let codec = DefaultLogicalExtensionCodec {}; + let proto = + datafusion_proto::protobuf::LogicalPlanNode::try_from_logical_plan(&self.plan, &codec)?; + + let bytes = proto.encode_to_vec(); + Ok(PyBytes::new_bound(py, &bytes)) + } + + #[staticmethod] + pub fn from_proto(ctx: PySessionContext, proto_msg: Bound<'_, PyBytes>) -> PyResult { + let bytes: &[u8] = proto_msg.extract()?; + let proto_plan = + datafusion_proto::protobuf::LogicalPlanNode::decode(bytes).map_err(|e| { + PyRuntimeError::new_err(format!( + "Unable to decode logical node from serialized bytes: {}", + e + )) + })?; + + let codec = DefaultLogicalExtensionCodec {}; + let plan = proto_plan + .try_into_logical_plan(&ctx.ctx, &codec) + .map_err(DataFusionError::from)?; + Ok(Self::new(plan)) + } } impl From for LogicalPlan { From 1fd3762ce471fe375598740af447e6747f307bdc Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 4 Oct 2024 12:26:30 -0400 Subject: [PATCH 049/248] Feature/instance udfs (#890) * Add option for passing in constructor arguments to the udaf * Fix small warnings in pylance * Improve type hinting for udaf and fix one pylance warning * Set up UDWF to take arguments as constructor just like UDAF to ensure we get a clean state when functions are reused * Improve handling of udf when user provides a class instead of bare function * Add unit 
tests for UDF showing callable class * Add license text * Switching to use factory methods for udaf and udwf * Move new tests to the new testing directory --- python/datafusion/udf.py | 136 +++++++++++++++----- python/tests/test_dataframe.py | 16 --- python/{datafusion => }/tests/test_plans.py | 0 python/tests/test_udaf.py | 79 +++++++----- python/tests/test_udf.py | 96 ++++++++++++++ python/tests/test_udwf.py | 36 ++++-- src/udwf.rs | 6 +- 7 files changed, 272 insertions(+), 97 deletions(-) rename python/{datafusion => }/tests/test_plans.py (100%) create mode 100644 python/tests/test_udf.py diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index bb7a90866..291ef2bae 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -23,7 +23,7 @@ from datafusion.expr import Expr from typing import Callable, TYPE_CHECKING, TypeVar from abc import ABCMeta, abstractmethod -from typing import List +from typing import List, Optional from enum import Enum import pyarrow @@ -84,9 +84,9 @@ class ScalarUDF: def __init__( self, - name: str | None, + name: Optional[str], func: Callable[..., _R], - input_types: list[pyarrow.DataType], + input_types: pyarrow.DataType | list[pyarrow.DataType], return_type: _R, volatility: Volatility | str, ) -> None: @@ -94,6 +94,8 @@ def __init__( See helper method :py:func:`udf` for argument details. """ + if isinstance(input_types, pyarrow.DataType): + input_types = [input_types] self._udf = df_internal.ScalarUDF( name, func, input_types, return_type, str(volatility) ) @@ -104,8 +106,8 @@ def __call__(self, *args: Expr) -> Expr: This function is not typically called by an end user. These calls will occur during the evaluation of the dataframe. 
""" - args = [arg.expr for arg in args] - return Expr(self._udf.__call__(*args)) + args_raw = [arg.expr for arg in args] + return Expr(self._udf.__call__(*args_raw)) @staticmethod def udf( @@ -113,7 +115,7 @@ def udf( input_types: list[pyarrow.DataType], return_type: _R, volatility: Volatility | str, - name: str | None = None, + name: Optional[str] = None, ) -> ScalarUDF: """Create a new User-Defined Function. @@ -133,7 +135,10 @@ def udf( if not callable(func): raise TypeError("`func` argument must be callable") if name is None: - name = func.__qualname__.lower() + if hasattr(func, "__qualname__"): + name = func.__qualname__.lower() + else: + name = func.__class__.__name__.lower() return ScalarUDF( name=name, func=func, @@ -167,10 +172,6 @@ def evaluate(self) -> pyarrow.Scalar: pass -if TYPE_CHECKING: - _A = TypeVar("_A", bound=(Callable[..., _R], Accumulator)) - - class AggregateUDF: """Class for performing scalar user-defined functions (UDF). @@ -180,10 +181,10 @@ class AggregateUDF: def __init__( self, - name: str | None, - accumulator: _A, + name: Optional[str], + accumulator: Callable[[], Accumulator], input_types: list[pyarrow.DataType], - return_type: _R, + return_type: pyarrow.DataType, state_type: list[pyarrow.DataType], volatility: Volatility | str, ) -> None: @@ -193,7 +194,12 @@ def __init__( descriptions. """ self._udaf = df_internal.AggregateUDF( - name, accumulator, input_types, return_type, state_type, str(volatility) + name, + accumulator, + input_types, + return_type, + state_type, + str(volatility), ) def __call__(self, *args: Expr) -> Expr: @@ -202,21 +208,52 @@ def __call__(self, *args: Expr) -> Expr: This function is not typically called by an end user. These calls will occur during the evaluation of the dataframe. 
""" - args = [arg.expr for arg in args] - return Expr(self._udaf.__call__(*args)) + args_raw = [arg.expr for arg in args] + return Expr(self._udaf.__call__(*args_raw)) @staticmethod def udaf( - accum: _A, - input_types: list[pyarrow.DataType], - return_type: _R, + accum: Callable[[], Accumulator], + input_types: pyarrow.DataType | list[pyarrow.DataType], + return_type: pyarrow.DataType, state_type: list[pyarrow.DataType], volatility: Volatility | str, - name: str | None = None, + name: Optional[str] = None, ) -> AggregateUDF: """Create a new User-Defined Aggregate Function. - The accumulator function must be callable and implement :py:class:`Accumulator`. + If your :py:class:`Accumulator` can be instantiated with no arguments, you + can simply pass it's type as ``accum``. If you need to pass additional arguments + to it's constructor, you can define a lambda or a factory method. During runtime + the :py:class:`Accumulator` will be constructed for every instance in + which this UDAF is used. The following examples are all valid. + + .. code-block:: python + import pyarrow as pa + import pyarrow.compute as pc + + class Summarize(Accumulator): + def __init__(self, bias: float = 0.0): + self._sum = pa.scalar(bias) + + def state(self) -> List[pa.Scalar]: + return [self._sum] + + def update(self, values: pa.Array) -> None: + self._sum = pa.scalar(self._sum.as_py() + pc.sum(values).as_py()) + + def merge(self, states: List[pa.Array]) -> None: + self._sum = pa.scalar(self._sum.as_py() + pc.sum(states[0]).as_py()) + + def evaluate(self) -> pa.Scalar: + return self._sum + + def sum_bias_10() -> Summarize: + return Summarize(10.0) + + udaf1 = udaf(Summarize, pa.float64(), pa.float64(), [pa.float64()], "immutable") + udaf2 = udaf(sum_bias_10, pa.float64(), pa.float64(), [pa.float64()], "immutable") + udaf3 = udaf(lambda: Summarize(20.0), pa.float64(), pa.float64(), [pa.float64()], "immutable") Args: accum: The accumulator python function. 
@@ -229,14 +266,16 @@ def udaf( Returns: A user-defined aggregate function, which can be used in either data aggregation or window function calls. - """ - if not issubclass(accum, Accumulator): + """ # noqa W505 + if not callable(accum): + raise TypeError("`func` must be callable.") + if not isinstance(accum.__call__(), Accumulator): raise TypeError( - "`accum` must implement the abstract base class Accumulator" + "Accumulator must implement the abstract base class Accumulator" ) if name is None: - name = accum.__qualname__.lower() - if isinstance(input_types, pyarrow.lib.DataType): + name = accum.__call__().__class__.__qualname__.lower() + if isinstance(input_types, pyarrow.DataType): input_types = [input_types] return AggregateUDF( name=name, @@ -421,8 +460,8 @@ class WindowUDF: def __init__( self, - name: str | None, - func: WindowEvaluator, + name: Optional[str], + func: Callable[[], WindowEvaluator], input_types: list[pyarrow.DataType], return_type: pyarrow.DataType, volatility: Volatility | str, @@ -447,30 +486,57 @@ def __call__(self, *args: Expr) -> Expr: @staticmethod def udwf( - func: WindowEvaluator, + func: Callable[[], WindowEvaluator], input_types: pyarrow.DataType | list[pyarrow.DataType], return_type: pyarrow.DataType, volatility: Volatility | str, - name: str | None = None, + name: Optional[str] = None, ) -> WindowUDF: """Create a new User-Defined Window Function. + If your :py:class:`WindowEvaluator` can be instantiated with no arguments, you + can simply pass it's type as ``func``. If you need to pass additional arguments + to it's constructor, you can define a lambda or a factory method. During runtime + the :py:class:`WindowEvaluator` will be constructed for every instance in + which this UDWF is used. The following examples are all valid. + + .. 
code-block:: python + + import pyarrow as pa + + class BiasedNumbers(WindowEvaluator): + def __init__(self, start: int = 0) -> None: + self.start = start + + def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: + return pa.array([self.start + i for i in range(num_rows)]) + + def bias_10() -> BiasedNumbers: + return BiasedNumbers(10) + + udwf1 = udwf(BiasedNumbers, pa.int64(), pa.int64(), "immutable") + udwf2 = udwf(bias_10, pa.int64(), pa.int64(), "immutable") + udwf3 = udwf(lambda: BiasedNumbers(20), pa.int64(), pa.int64(), "immutable") + Args: - func: The python function. + func: A callable to create the window function. input_types: The data types of the arguments to ``func``. return_type: The data type of the return value. volatility: See :py:class:`Volatility` for allowed values. + arguments: A list of arguments to pass in to the __init__ method for accum. name: A descriptive name for the function. Returns: A user-defined window function. - """ - if not isinstance(func, WindowEvaluator): + """ # noqa W505 + if not callable(func): + raise TypeError("`func` must be callable.") + if not isinstance(func.__call__(), WindowEvaluator): raise TypeError( "`func` must implement the abstract base class WindowEvaluator" ) if name is None: - name = func.__class__.__qualname__.lower() + name = func.__call__().__class__.__qualname__.lower() if isinstance(input_types, pyarrow.DataType): input_types = [input_types] return WindowUDF( diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index ad7f728b4..e89c57159 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -29,7 +29,6 @@ WindowFrame, column, literal, - udf, ) from datafusion.expr import Window @@ -236,21 +235,6 @@ def test_unnest_without_nulls(nested_df): assert result.column(1) == pa.array([7, 8, 8, 9, 9, 9]) -def test_udf(df): - # is_null is a pa function over arrays - is_null = udf( - lambda x: x.is_null(), - [pa.int64()], - pa.bool_(), - 
volatility="immutable", - ) - - df = df.select(is_null(column("a"))) - result = df.collect()[0].column(0) - - assert result == pa.array([False, False, False]) - - def test_join(): ctx = SessionContext() diff --git a/python/datafusion/tests/test_plans.py b/python/tests/test_plans.py similarity index 100% rename from python/datafusion/tests/test_plans.py rename to python/tests/test_plans.py diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py index 6f2525b0f..8f31748e0 100644 --- a/python/tests/test_udaf.py +++ b/python/tests/test_udaf.py @@ -21,14 +21,14 @@ import pyarrow.compute as pc import pytest -from datafusion import Accumulator, column, udaf, udf +from datafusion import Accumulator, column, udaf class Summarize(Accumulator): """Interface of a user-defined accumulation.""" - def __init__(self): - self._sum = pa.scalar(0.0) + def __init__(self, initial_value: float = 0.0): + self._sum = pa.scalar(initial_value) def state(self) -> List[pa.Scalar]: return [self._sum] @@ -79,25 +79,22 @@ def test_errors(df): volatility="immutable", ) - accum = udaf( - MissingMethods, - pa.int64(), - pa.int64(), - [pa.int64()], - volatility="immutable", - ) - df = df.aggregate([], [accum(column("a"))]) - msg = ( "Can't instantiate abstract class MissingMethods (without an implementation " "for abstract methods 'evaluate', 'merge', 'update'|with abstract methods " "evaluate, merge, update)" ) with pytest.raises(Exception, match=msg): - df.collect() + accum = udaf( # noqa F841 + MissingMethods, + pa.int64(), + pa.int64(), + [pa.int64()], + volatility="immutable", + ) -def test_aggregate(df): +def test_udaf_aggregate(df): summarize = udaf( Summarize, pa.float64(), @@ -106,13 +103,46 @@ def test_aggregate(df): volatility="immutable", ) - df = df.aggregate([], [summarize(column("a"))]) + df1 = df.aggregate([], [summarize(column("a"))]) # execute and collect the first (and only) batch - result = df.collect()[0] + result = df1.collect()[0] assert result.column(0) == 
pa.array([1.0 + 2.0 + 3.0]) + df2 = df.aggregate([], [summarize(column("a"))]) + + # Run a second time to ensure the state is properly reset + result = df2.collect()[0] + + assert result.column(0) == pa.array([1.0 + 2.0 + 3.0]) + + +def test_udaf_aggregate_with_arguments(df): + bias = 10.0 + + summarize = udaf( + lambda: Summarize(bias), + pa.float64(), + pa.float64(), + [pa.float64()], + volatility="immutable", + ) + + df1 = df.aggregate([], [summarize(column("a"))]) + + # execute and collect the first (and only) batch + result = df1.collect()[0] + + assert result.column(0) == pa.array([bias + 1.0 + 2.0 + 3.0]) + + df2 = df.aggregate([], [summarize(column("a"))]) + + # Run a second time to ensure the state is properly reset + result = df2.collect()[0] + + assert result.column(0) == pa.array([bias + 1.0 + 2.0 + 3.0]) + def test_group_by(df): summarize = udaf( @@ -146,20 +176,3 @@ def test_register_udaf(ctx, df) -> None: df_result = ctx.sql("select summarize(b) from test_table") assert df_result.collect()[0][0][0].as_py() == 14.0 - - -def test_register_udf(ctx, df) -> None: - is_null = udf( - lambda x: x.is_null(), - [pa.float64()], - pa.bool_(), - volatility="immutable", - name="is_null", - ) - - ctx.register_udf(is_null) - - df_result = ctx.sql("select is_null(a) from test_table") - result = df_result.collect()[0].column(0) - - assert result == pa.array([False, False, False]) diff --git a/python/tests/test_udf.py b/python/tests/test_udf.py new file mode 100644 index 000000000..568a66dbb --- /dev/null +++ b/python/tests/test_udf.py @@ -0,0 +1,96 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from datafusion import udf, column +import pyarrow as pa +import pytest + + +@pytest.fixture +def df(ctx): + # create a RecordBatch and a new DataFrame from it + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 4, 6])], + names=["a", "b"], + ) + return ctx.create_dataframe([[batch]], name="test_table") + + +def test_udf(df): + # is_null is a pa function over arrays + is_null = udf( + lambda x: x.is_null(), + [pa.int64()], + pa.bool_(), + volatility="immutable", + ) + + df = df.select(is_null(column("a"))) + result = df.collect()[0].column(0) + + assert result == pa.array([False, False, False]) + + +def test_register_udf(ctx, df) -> None: + is_null = udf( + lambda x: x.is_null(), + [pa.float64()], + pa.bool_(), + volatility="immutable", + name="is_null", + ) + + ctx.register_udf(is_null) + + df_result = ctx.sql("select is_null(a) from test_table") + result = df_result.collect()[0].column(0) + + assert result == pa.array([False, False, False]) + + +class OverThresholdUDF: + def __init__(self, threshold: int = 0) -> None: + self.threshold = threshold + + def __call__(self, values: pa.Array) -> pa.Array: + return pa.array(v.as_py() >= self.threshold for v in values) + + +def test_udf_with_parameters(df) -> None: + udf_no_param = udf( + OverThresholdUDF(), + pa.int64(), + pa.bool_(), + volatility="immutable", + ) + + df1 = df.select(udf_no_param(column("a"))) + result = df1.collect()[0].column(0) + + assert result == pa.array([True, True, True]) + + udf_with_param = udf( + OverThresholdUDF(2), + pa.int64(), + pa.bool_(), + 
volatility="immutable", + ) + + df2 = df.select(udf_with_param(column("a"))) + result = df2.collect()[0].column(0) + + assert result == pa.array([False, True, True]) diff --git a/python/tests/test_udwf.py b/python/tests/test_udwf.py index 67c0979fe..2099ac9bc 100644 --- a/python/tests/test_udwf.py +++ b/python/tests/test_udwf.py @@ -24,7 +24,7 @@ class ExponentialSmoothDefault(WindowEvaluator): - def __init__(self, alpha: float) -> None: + def __init__(self, alpha: float = 0.9) -> None: self.alpha = alpha def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: @@ -44,7 +44,7 @@ def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: class ExponentialSmoothBounded(WindowEvaluator): - def __init__(self, alpha: float) -> None: + def __init__(self, alpha: float = 0.9) -> None: self.alpha = alpha def supports_bounded_execution(self) -> bool: @@ -75,7 +75,7 @@ def evaluate( class ExponentialSmoothRank(WindowEvaluator): - def __init__(self, alpha: float) -> None: + def __init__(self, alpha: float = 0.9) -> None: self.alpha = alpha def include_rank(self) -> bool: @@ -101,7 +101,7 @@ def evaluate_all_with_rank( class ExponentialSmoothFrame(WindowEvaluator): - def __init__(self, alpha: float) -> None: + def __init__(self, alpha: float = 0.9) -> None: self.alpha = alpha def uses_window_frame(self) -> bool: @@ -134,7 +134,7 @@ class SmoothTwoColumn(WindowEvaluator): the previous and next rows. 
""" - def __init__(self, alpha: float) -> None: + def __init__(self, alpha: float = 0.9) -> None: self.alpha = alpha def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: @@ -183,7 +183,7 @@ def df(): def test_udwf_errors(df): with pytest.raises(TypeError): udwf( - NotSubclassOfWindowEvaluator(), + NotSubclassOfWindowEvaluator, pa.float64(), pa.float64(), volatility="immutable", @@ -191,35 +191,42 @@ def test_udwf_errors(df): smooth_default = udwf( - ExponentialSmoothDefault(0.9), + ExponentialSmoothDefault, + pa.float64(), + pa.float64(), + volatility="immutable", +) + +smooth_w_arguments = udwf( + lambda: ExponentialSmoothDefault(0.8), pa.float64(), pa.float64(), volatility="immutable", ) smooth_bounded = udwf( - ExponentialSmoothBounded(0.9), + ExponentialSmoothBounded, pa.float64(), pa.float64(), volatility="immutable", ) smooth_rank = udwf( - ExponentialSmoothRank(0.9), + ExponentialSmoothRank, pa.utf8(), pa.float64(), volatility="immutable", ) smooth_frame = udwf( - ExponentialSmoothFrame(0.9), + ExponentialSmoothFrame, pa.float64(), pa.float64(), volatility="immutable", ) smooth_two_col = udwf( - SmoothTwoColumn(0.9), + SmoothTwoColumn, [pa.int64(), pa.int64()], pa.float64(), volatility="immutable", @@ -227,10 +234,15 @@ def test_udwf_errors(df): data_test_udwf_functions = [ ( - "default_udwf", + "default_udwf_no_arguments", smooth_default(column("a")), [0, 0.9, 1.89, 2.889, 3.889, 4.889, 5.889], ), + ( + "default_udwf_w_arguments", + smooth_w_arguments(column("a")), + [0, 0.8, 1.76, 2.752, 3.75, 4.75, 5.75], + ), ( "default_udwf_partitioned", smooth_default(column("a")).partition_by(column("c")).build(), diff --git a/src/udwf.rs b/src/udwf.rs index 31cc5e60e..43c21ec7b 100644 --- a/src/udwf.rs +++ b/src/udwf.rs @@ -197,7 +197,11 @@ impl PartitionEvaluator for RustPartitionEvaluator { pub fn to_rust_partition_evaluator(evaluator: PyObject) -> PartitionEvaluatorFactory { Arc::new(move || -> Result> { - let evaluator = 
Python::with_gil(|py| evaluator.clone_ref(py)); + let evaluator = Python::with_gil(|py| { + evaluator + .call0(py) + .map_err(|e| DataFusionError::Execution(e.to_string())) + })?; Ok(Box::new(RustPartitionEvaluator::new(evaluator))) }) } From 108221413cc15caeddb0a266028cbc824b5b46fd Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Fri, 4 Oct 2024 12:26:41 -0500 Subject: [PATCH 050/248] chore(ci): remove Mambaforge variant from CI (#894) * migrate away from mambaforge in CI It is being sunsetted and will stop working entirely in Jan 2025. Ref: https://conda-forge.org/news/2024/07/29/sunsetting-mambaforge/ * add conda-build to conda workflow --- .github/workflows/conda.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index 52888cb14..c2b8fab02 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -70,17 +70,16 @@ jobs: - name: Set up Python uses: conda-incubator/setup-miniconda@v3.0.4 with: - miniforge-variant: Mambaforge - use-mamba: true + miniforge-variant: Miniforge3 python-version: "3.8" channel-priority: strict - name: Install dependencies run: | - mamba install -c conda-forge boa conda-verify + conda install -c conda-forge conda-build conda-verify which python pip list - mamba list + conda list # Clean the conda cache - name: Clean Conda Cache run: conda clean --all --yes @@ -89,7 +88,7 @@ jobs: # suffix for nightly package versions export VERSION_SUFFIX=a`date +%y%m%d` - conda mambabuild conda/recipes \ + conda build conda/recipes \ --python ${{ matrix.python }} \ --variants "{target_platform: [${{ matrix.arch }}]}" \ --error-overlinking \ @@ -99,7 +98,7 @@ jobs: - name: Test conda packages if: matrix.arch == 'linux-64' # can only test native platform packages run: | - conda mambabuild --test packages/${{ matrix.arch }}/*.tar.bz2 + conda build --test packages/${{ matrix.arch }}/*.tar.bz2 - name: Upload conda packages as artifacts uses: 
actions/upload-artifact@v4 with: From ec8246da3b45e766fe6fb515ade01e0bae73af98 Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Sat, 5 Oct 2024 09:57:22 -0500 Subject: [PATCH 051/248] Use OnceLock to store TokioRuntime (#895) * bump rust-version to match upstream datafusion * use std::sync::OnceLock to store tokio runtime instead of round-tripping to python * stop exporting TokioRuntime to python * remove unused argument from get_tokio_runtime * remove superflous Arc from get_tokio_runtime * add #[inline] annotation to get_tokio_runtime I also included a reference comment in case future users experience problems with using datafusion-python behind a forking app server l ike `gunicorn`. * fix clippy lint * cargo fmt --- Cargo.toml | 2 +- python/datafusion/__init__.py | 3 +-- src/context.rs | 2 +- src/dataframe.rs | 4 ++-- src/lib.rs | 6 ------ src/utils.rs | 23 +++++++++++------------ 6 files changed, 16 insertions(+), 24 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a0723984f..4f2602316 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,7 +25,7 @@ description = "Apache DataFusion DataFrame and SQL Query Engine" readme = "README.md" license = "Apache-2.0" edition = "2021" -rust-version = "1.64" +rust-version = "1.78" include = ["/src", "/datafusion", "/LICENSE.txt", "pyproject.toml", "Cargo.toml", "Cargo.lock"] [features] diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 63c19b3e1..e0bc57f44 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -36,7 +36,7 @@ from .catalog import Catalog, Database, Table # The following imports are okay to remain as opaque to the user. 
-from ._internal import Config, runtime +from ._internal import Config from .record_batch import RecordBatchStream, RecordBatch @@ -75,7 +75,6 @@ "literal", "lit", "DFSchema", - "runtime", "Catalog", "Database", "Table", diff --git a/src/context.rs b/src/context.rs index fde442ce4..5317a3eda 100644 --- a/src/context.rs +++ b/src/context.rs @@ -982,7 +982,7 @@ impl PySessionContext { ) -> PyResult { let ctx: TaskContext = TaskContext::from(&self.ctx.state()); // create a Tokio runtime to run the async code - let rt = &get_tokio_runtime(py).0; + let rt = &get_tokio_runtime().0; let plan = plan.plan.clone(); let fut: JoinHandle> = rt.spawn(async move { plan.execute(part, Arc::new(ctx)) }); diff --git a/src/dataframe.rs b/src/dataframe.rs index 1f7f2e643..e77ca8425 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -543,7 +543,7 @@ impl PyDataFrame { fn execute_stream(&self, py: Python) -> PyResult { // create a Tokio runtime to run the async code - let rt = &get_tokio_runtime(py).0; + let rt = &get_tokio_runtime().0; let df = self.df.as_ref().clone(); let fut: JoinHandle> = rt.spawn(async move { df.execute_stream().await }); @@ -553,7 +553,7 @@ impl PyDataFrame { fn execute_stream_partitioned(&self, py: Python) -> PyResult> { // create a Tokio runtime to run the async code - let rt = &get_tokio_runtime(py).0; + let rt = &get_tokio_runtime().0; let df = self.df.as_ref().clone(); let fut: JoinHandle>> = rt.spawn(async move { df.execute_stream_partitioned().await }); diff --git a/src/lib.rs b/src/lib.rs index 98821833d..0b57e0999 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -66,7 +66,6 @@ pub mod utils; static GLOBAL: MiMalloc = MiMalloc; // Used to define Tokio Runtime as a Python module attribute -#[pyclass] pub(crate) struct TokioRuntime(tokio::runtime::Runtime); /// Low-level DataFusion internal package. @@ -75,11 +74,6 @@ pub(crate) struct TokioRuntime(tokio::runtime::Runtime); /// datafusion directory. 
#[pymodule] fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { - // Register the Tokio Runtime as a module attribute so we can reuse it - m.add( - "runtime", - TokioRuntime(tokio::runtime::Runtime::new().unwrap()), - )?; // Register the python classes m.add_class::()?; m.add_class::()?; diff --git a/src/utils.rs b/src/utils.rs index 0d72eaf75..7fb23cafe 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -20,20 +20,19 @@ use crate::TokioRuntime; use datafusion::logical_expr::Volatility; use pyo3::prelude::*; use std::future::Future; +use std::sync::OnceLock; use tokio::runtime::Runtime; /// Utility to get the Tokio Runtime from Python -pub(crate) fn get_tokio_runtime(py: Python) -> PyRef { - let datafusion = py.import_bound("datafusion._internal").unwrap(); - let tmp = datafusion.getattr("runtime").unwrap(); - match tmp.extract::>() { - Ok(runtime) => runtime, - Err(_e) => { - let rt = TokioRuntime(tokio::runtime::Runtime::new().unwrap()); - let obj: Bound<'_, TokioRuntime> = Py::new(py, rt).unwrap().into_bound(py); - obj.extract().unwrap() - } - } +#[inline] +pub(crate) fn get_tokio_runtime() -> &'static TokioRuntime { + // NOTE: Other pyo3 python libraries have had issues with using tokio + // behind a forking app-server like `gunicorn` + // If we run into that problem, in the future we can look to `delta-rs` + // which adds a check in that disallows calls from a forked process + // https://github.com/delta-io/delta-rs/blob/87010461cfe01563d91a4b9cd6fa468e2ad5f283/python/src/utils.rs#L10-L31 + static RUNTIME: OnceLock = OnceLock::new(); + RUNTIME.get_or_init(|| TokioRuntime(tokio::runtime::Runtime::new().unwrap())) } /// Utility to collect rust futures with GIL released @@ -42,7 +41,7 @@ where F: Future + Send, F::Output: Send, { - let runtime: &Runtime = &get_tokio_runtime(py).0; + let runtime: &Runtime = &get_tokio_runtime().0; py.allow_threads(|| runtime.block_on(f)) } From 6c8292f8c6c67d79868a87bc16e0b06ac14d7f0b Mon Sep 17 00:00:00 2001 From: 
Daniel Mesejo Date: Sun, 6 Oct 2024 23:41:45 +0200 Subject: [PATCH 052/248] feat: expose http object store (#885) * feat: expose HTTP ObjectStore The objective is to allow the user to register CSV directly from an HTTP URL, delaying downloading the file until required * chore: return PyResult --- Cargo.toml | 2 +- python/datafusion/context.py | 4 +++- python/datafusion/object_store.py | 12 ++--------- python/tests/test_sql.py | 12 +++++++++-- python/tests/test_store.py | 10 ++------- python/tests/test_wrapper_coverage.py | 2 +- src/context.rs | 1 + src/store.rs | 31 +++++++++++++++++++++++++++ 8 files changed, 51 insertions(+), 23 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4f2602316..d2d3e79ed 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -47,7 +47,7 @@ uuid = { version = "1.9", features = ["v4"] } mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] } async-trait = "0.1" futures = "0.3" -object_store = { version = "0.11.0", features = ["aws", "gcp", "azure"] } +object_store = { version = "0.11.0", features = ["aws", "gcp", "azure", "http"] } parking_lot = "0.12" regex-syntax = "0.8" syn = "2.0.79" diff --git a/python/datafusion/context.py b/python/datafusion/context.py index b08e62d77..957d7e311 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -450,7 +450,9 @@ def __init__( self.ctx = SessionContextInternal(config, runtime) - def register_object_store(self, schema: str, store: Any, host: str | None) -> None: + def register_object_store( + self, schema: str, store: Any, host: str | None = None + ) -> None: """Add a new object store into the session. 
Args: diff --git a/python/datafusion/object_store.py b/python/datafusion/object_store.py index c927e7614..7cc17506f 100644 --- a/python/datafusion/object_store.py +++ b/python/datafusion/object_store.py @@ -22,14 +22,6 @@ GoogleCloud = object_store.GoogleCloud LocalFileSystem = object_store.LocalFileSystem MicrosoftAzure = object_store.MicrosoftAzure +Http = object_store.Http -__all__ = [ - "AmazonS3", - "GoogleCloud", - "LocalFileSystem", - "MicrosoftAzure", -] - - -def __getattr__(name): - return getattr(object_store, name) +__all__ = ["AmazonS3", "GoogleCloud", "LocalFileSystem", "MicrosoftAzure", "Http"] diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py index e39a9f5c7..39e5ffe6d 100644 --- a/python/tests/test_sql.py +++ b/python/tests/test_sql.py @@ -22,7 +22,7 @@ from pyarrow.csv import write_csv import pyarrow.dataset as ds import pytest -from datafusion.object_store import LocalFileSystem +from datafusion.object_store import Http from datafusion import udf, col @@ -139,6 +139,15 @@ def test_register_csv_list(ctx, tmp_path): assert int_sum == 2 * sum(int_values) +def test_register_http_csv(ctx): + url = "https://raw.githubusercontent.com/ibis-project/testing-data/refs/heads/master/csv/diamonds.csv" + ctx.register_object_store("", Http(url)) + ctx.register_csv("remote", url) + assert ctx.table_exist("remote") + res, *_ = ctx.sql("SELECT COUNT(*) AS total FROM remote").to_pylist() + assert res["total"] > 0 + + def test_register_parquet(ctx, tmp_path): path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) ctx.register_parquet("t", path) @@ -494,7 +503,6 @@ def test_register_listing_table( dir_root = f"file://{dir_root}/" if path_to_str else dir_root - ctx.register_object_store("file://local", LocalFileSystem(), None) ctx.register_listing_table( "my_table", dir_root, diff --git a/python/tests/test_store.py b/python/tests/test_store.py index 3ffd9ee49..f85b28311 100644 --- a/python/tests/test_store.py +++ b/python/tests/test_store.py 
@@ -16,21 +16,15 @@ # under the License. import os + import pytest from datafusion import SessionContext -from datafusion.object_store import LocalFileSystem - - -@pytest.fixture -def local(): - return LocalFileSystem() @pytest.fixture -def ctx(local): +def ctx(): ctx = SessionContext() - ctx.register_object_store("file://local", local, None) return ctx diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py index c53a89c59..86f2d57f2 100644 --- a/python/tests/test_wrapper_coverage.py +++ b/python/tests/test_wrapper_coverage.py @@ -55,7 +55,7 @@ def missing_exports(internal_obj, wrapped_obj) -> None: def test_datafusion_missing_exports() -> None: - """Check for any missing pythone exports. + """Check for any missing python exports. This test verifies that every exposed class, attribute, and function in the internal (pyo3) module is also exposed in our python wrappers. diff --git a/src/context.rs b/src/context.rs index 5317a3eda..f445874d6 100644 --- a/src/context.rs +++ b/src/context.rs @@ -312,6 +312,7 @@ impl PySessionContext { StorageContexts::GoogleCloudStorage(gcs) => (gcs.inner, gcs.bucket_name), StorageContexts::MicrosoftAzure(azure) => (azure.inner, azure.container_name), StorageContexts::LocalFileSystem(local) => (local.inner, "".to_string()), + StorageContexts::HTTP(http) => (http.store, http.url), }; // let users override the host to match the api signature from upstream diff --git a/src/store.rs b/src/store.rs index 846d96a6d..1e5fab472 100644 --- a/src/store.rs +++ b/src/store.rs @@ -22,7 +22,10 @@ use pyo3::prelude::*; use object_store::aws::{AmazonS3, AmazonS3Builder}; use object_store::azure::{MicrosoftAzure, MicrosoftAzureBuilder}; use object_store::gcp::{GoogleCloudStorage, GoogleCloudStorageBuilder}; +use object_store::http::{HttpBuilder, HttpStore}; use object_store::local::LocalFileSystem; +use pyo3::exceptions::PyValueError; +use url::Url; #[derive(FromPyObject)] pub enum StorageContexts { @@ -30,6 +33,7 @@ 
pub enum StorageContexts { GoogleCloudStorage(PyGoogleCloudContext), MicrosoftAzure(PyMicrosoftAzureContext), LocalFileSystem(PyLocalFileSystemContext), + HTTP(PyHttpContext), } #[pyclass(name = "LocalFileSystem", module = "datafusion.store", subclass)] @@ -219,10 +223,37 @@ impl PyAmazonS3Context { } } +#[pyclass(name = "Http", module = "datafusion.store", subclass)] +#[derive(Debug, Clone)] +pub struct PyHttpContext { + pub url: String, + pub store: Arc, +} + +#[pymethods] +impl PyHttpContext { + #[new] + fn new(url: String) -> PyResult { + let store = match Url::parse(url.as_str()) { + Ok(url) => HttpBuilder::new() + .with_url(url.origin().ascii_serialization()) + .build(), + Err(_) => HttpBuilder::new().build(), + } + .map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string())))?; + + Ok(Self { + url, + store: Arc::new(store), + }) + } +} + pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; Ok(()) } From af2d66581817c21048ad77729069b7af40b4ba98 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 7 Oct 2024 07:27:34 -0600 Subject: [PATCH 053/248] Update version and generate changelog (#901) --- CHANGELOG.md | 49 ++++++++++++++- Cargo.lock | 170 ++++++++++++++++++++++++--------------------------- Cargo.toml | 8 +-- 3 files changed, 131 insertions(+), 96 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 56a2bfc79..ae3a2348a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,54 @@ # DataFusion Python Changelog +## [42.0.0](https://github.com/apache/datafusion-python/tree/42.0.0) (2024-10-06) + +This release consists of 20 commits from 6 contributors. See credits at the end of this changelog for more information. 
+ +**Implemented enhancements:** + +- feat: expose between [#868](https://github.com/apache/datafusion-python/pull/868) (mesejo) +- feat: make register_csv accept a list of paths [#883](https://github.com/apache/datafusion-python/pull/883) (mesejo) +- feat: expose http object store [#885](https://github.com/apache/datafusion-python/pull/885) (mesejo) + +**Fixed bugs:** + +- fix: Calling `count` on a pyarrow dataset results in an error [#843](https://github.com/apache/datafusion-python/pull/843) (Michael-J-Ward) + +**Other:** + +- Upgrade datafusion [#867](https://github.com/apache/datafusion-python/pull/867) (emgeee) +- Feature/aggregates as windows [#871](https://github.com/apache/datafusion-python/pull/871) (timsaucer) +- Fix regression on register_udaf [#878](https://github.com/apache/datafusion-python/pull/878) (timsaucer) +- build(deps): upgrade setup-protoc action and protoc version number [#873](https://github.com/apache/datafusion-python/pull/873) (Michael-J-Ward) +- build(deps): bump prost-types from 0.13.2 to 0.13.3 [#881](https://github.com/apache/datafusion-python/pull/881) (dependabot[bot]) +- build(deps): bump prost from 0.13.2 to 0.13.3 [#882](https://github.com/apache/datafusion-python/pull/882) (dependabot[bot]) +- chore: remove XFAIL from passing tests [#884](https://github.com/apache/datafusion-python/pull/884) (Michael-J-Ward) +- Add user defined window function support [#880](https://github.com/apache/datafusion-python/pull/880) (timsaucer) +- build(deps): bump syn from 2.0.77 to 2.0.79 [#886](https://github.com/apache/datafusion-python/pull/886) (dependabot[bot]) +- fix example of reading parquet from s3 [#896](https://github.com/apache/datafusion-python/pull/896) (sir-sigurd) +- release-testing [#889](https://github.com/apache/datafusion-python/pull/889) (Michael-J-Ward) +- chore(bench): fix create_tables.sql for tpch benchmark [#897](https://github.com/apache/datafusion-python/pull/897) (Michael-J-Ward) +- Add physical and logical plan 
conversion to and from protobuf [#892](https://github.com/apache/datafusion-python/pull/892) (timsaucer) +- Feature/instance udfs [#890](https://github.com/apache/datafusion-python/pull/890) (timsaucer) +- chore(ci): remove Mambaforge variant from CI [#894](https://github.com/apache/datafusion-python/pull/894) (Michael-J-Ward) +- Use OnceLock to store TokioRuntime [#895](https://github.com/apache/datafusion-python/pull/895) (Michael-J-Ward) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 7 Michael J Ward + 5 Tim Saucer + 3 Daniel Mesejo + 3 dependabot[bot] + 1 Matt Green + 1 Sergey Fedoseev +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + ## [41.0.0](https://github.com/apache/datafusion-python/tree/41.0.0) (2024-09-09) This release consists of 19 commits from 6 contributors. See credits at the end of this changelog for more information. @@ -63,7 +111,6 @@ Thank you to everyone who contributed to this release. Here is a breakdown of co Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. - ## [40.0.0](https://github.com/apache/datafusion-python/tree/40.0.0) (2024-08-09) This release consists of 18 commits from 4 contributors. See credits at the end of this changelog for more information. 
diff --git a/Cargo.lock b/Cargo.lock index 0935ea9e0..815323bf4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.24.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5fb1d8e4442bd405fdfd1dacb42792696b0cf9cb15882e5d097b742a676d375" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" dependencies = [ "gimli", ] @@ -130,9 +130,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45aef0d9cf9a039bf6cd1acc451b137aca819977b0928dece52bd92811b640ba" +checksum = "a9ba0d7248932f4e2a12fb37f0a2e3ec82b3bdedbac2a1dce186e036843b8f8c" dependencies = [ "arrow-arith", "arrow-array", @@ -152,9 +152,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03675e42d1560790f3524800e41403b40d0da1c793fe9528929fde06d8c7649a" +checksum = "d60afcdc004841a5c8d8da4f4fa22d64eb19c0c01ef4bcedd77f175a7cf6e38f" dependencies = [ "arrow-array", "arrow-buffer", @@ -167,9 +167,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd2bf348cf9f02a5975c5962c7fa6dee107a2009a7b41ac5fb1a027e12dc033f" +checksum = "7f16835e8599dbbb1659fd869d865254c4cf32c6c2bb60b6942ac9fc36bfa5da" dependencies = [ "ahash", "arrow-buffer", @@ -184,9 +184,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3092e37715f168976012ce52273c3989b5793b0db5f06cbaa246be25e5f0924d" +checksum = "1a1f34f0faae77da6b142db61deba2cb6d60167592b178be317b341440acba80" dependencies = [ 
"bytes", "half", @@ -195,9 +195,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ce1018bb710d502f9db06af026ed3561552e493e989a79d0d0f5d9cf267a785" +checksum = "450e4abb5775bca0740bec0bcf1b1a5ae07eff43bd625661c4436d8e8e4540c4" dependencies = [ "arrow-array", "arrow-buffer", @@ -216,9 +216,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd178575f45624d045e4ebee714e246a05d9652e41363ee3f57ec18cca97f740" +checksum = "d3a4e4d63830a341713e35d9a42452fbc6241d5f42fa5cf6a4681b8ad91370c4" dependencies = [ "arrow-array", "arrow-buffer", @@ -235,9 +235,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e4ac0c4ee79150afe067dc4857154b3ee9c1cd52b5f40d59a77306d0ed18d65" +checksum = "2b1e618bbf714c7a9e8d97203c806734f012ff71ae3adc8ad1b075689f540634" dependencies = [ "arrow-buffer", "arrow-schema", @@ -247,9 +247,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb307482348a1267f91b0912e962cd53440e5de0f7fb24c5f7b10da70b38c94a" +checksum = "f98e983549259a2b97049af7edfb8f28b8911682040e99a94e4ceb1196bd65c2" dependencies = [ "arrow-array", "arrow-buffer", @@ -262,9 +262,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d24805ba326758effdd6f2cbdd482fcfab749544f21b134701add25b33f474e6" +checksum = "b198b9c6fcf086501730efbbcb483317b39330a116125af7bb06467d04b352a3" dependencies = [ "arrow-array", "arrow-buffer", @@ -282,9 +282,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = 
"53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "644046c479d80ae8ed02a7f1e1399072ea344ca6a7b0e293ab2d5d9ed924aa3b" +checksum = "2427f37b4459a4b9e533045abe87a5183a5e0995a3fc2c2fd45027ae2cc4ef3f" dependencies = [ "arrow-array", "arrow-buffer", @@ -297,9 +297,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a29791f8eb13b340ce35525b723f5f0df17ecb955599e11f65c2a94ab34e2efb" +checksum = "15959657d92e2261a7a323517640af87f5afd9fd8a6492e424ebee2203c567f6" dependencies = [ "ahash", "arrow-array", @@ -311,18 +311,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c85320a3a2facf2b2822b57aa9d6d9d55edb8aee0b6b5d3b8df158e503d10858" +checksum = "fbf0388a18fd7f7f3fe3de01852d30f54ed5182f9004db700fbe3ba843ed2794" dependencies = [ "bitflags 2.6.0", ] [[package]] name = "arrow-select" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cc7e6b582e23855fd1625ce46e51647aa440c20ea2e71b1d748e0839dd73cba" +checksum = "b83e5723d307a38bf00ecd2972cd078d1339c7fd3eb044f609958a9a24463f3a" dependencies = [ "ahash", "arrow-array", @@ -334,9 +334,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0775b6567c66e56ded19b87a954b6b1beffbdd784ef95a3a2b03f59570c1d230" +checksum = "7ab3db7c09dd826e74079661d84ed01ed06547cf75d52c2818ef776d0d852305" dependencies = [ "arrow-array", "arrow-buffer", @@ -542,9 +542,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.24" +version = "1.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"812acba72f0a070b003d3697490d2b55b837230ae7c6c6497f05cc2ddbb8d938" +checksum = "2e80e3b6a3ab07840e1cae9b0666a63970dc28e8ed5ffbcdacbfc760c281bfc1" dependencies = [ "jobserver", "libc", @@ -572,9 +572,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.9.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93698b29de5e97ad0ae26447b344c482a7284c737d9ddc5f9e52b74a336671bb" +checksum = "cd6dd8046d00723a59a2f8c5f295c515b9bb9a331ee4f8f3d4dd49e428acd3b6" dependencies = [ "chrono", "chrono-tz-build", @@ -583,12 +583,11 @@ dependencies = [ [[package]] name = "chrono-tz-build" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c088aee841df9c3041febbb73934cfc39708749bf96dc827e3359cd39ef11b1" +checksum = "e94fea34d77a245229e7746bd2beb786cd2a896f306ff491fb8cecb3074b10a7" dependencies = [ "parse-zoneinfo", - "phf", "phf_codegen", ] @@ -1151,7 +1150,7 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "41.0.0" +version = "42.0.0" dependencies = [ "arrow", "async-trait", @@ -1161,14 +1160,10 @@ dependencies = [ "futures", "mimalloc", "object_store", - "parking_lot", "prost", "prost-types", "pyo3", "pyo3-build-config", - "rand", - "regex-syntax", - "syn", "tokio", "url", "uuid", @@ -1297,9 +1292,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", @@ -1312,9 +1307,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +checksum = 
"2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -1322,15 +1317,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" dependencies = [ "futures-core", "futures-task", @@ -1339,15 +1334,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-macro" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", @@ -1356,21 +1351,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-util" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-channel", "futures-core", @@ -1407,9 +1402,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.31.0" +version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32085ea23f3234fc7846555e85283ba4de91e21016dc0455a16286d87a292d64" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "glob" @@ -1659,9 +1654,9 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "ipnet" -version = "2.10.0" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "187674a687eed5fe42285b40c6291f9a01517d415fad1c3cbc6a9f778af7fcd4" +checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" [[package]] name = "itertools" @@ -1704,9 +1699,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lexical-core" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" +checksum = "0431c65b318a590c1de6b8fd6e72798c92291d27762d94c9e6c37ed7a73d8458" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -1717,9 +1712,9 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" 
+checksum = "eb17a4bdb9b418051aa59d41d65b1c9be5affab314a872e5ad7f06231fb3b4e0" dependencies = [ "lexical-parse-integer", "lexical-util", @@ -1728,9 +1723,9 @@ dependencies = [ [[package]] name = "lexical-parse-integer" -version = "0.8.6" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +checksum = "5df98f4a4ab53bf8b175b363a34c7af608fe31f93cc1fb1bf07130622ca4ef61" dependencies = [ "lexical-util", "static_assertions", @@ -1738,18 +1733,18 @@ dependencies = [ [[package]] name = "lexical-util" -version = "0.8.5" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" +checksum = "85314db53332e5c192b6bca611fb10c114a80d1b831ddac0af1e9be1b9232ca0" dependencies = [ "static_assertions", ] [[package]] name = "lexical-write-float" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" +checksum = "6e7c3ad4e37db81c1cbe7cf34610340adc09c322871972f74877a712abc6c809" dependencies = [ "lexical-util", "lexical-write-integer", @@ -1758,9 +1753,9 @@ dependencies = [ [[package]] name = "lexical-write-integer" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +checksum = "eb89e9f6958b83258afa3deed90b5de9ef68eef090ad5086c791cd2345610162" dependencies = [ "lexical-util", "static_assertions", @@ -2007,9 +2002,9 @@ dependencies = [ [[package]] name = "object" -version = "0.36.4" +version = "0.36.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084f1a5821ac4c651660a94a7153d27ac9d8a53736203f58b31945ded098070a" +checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" 
dependencies = [ "memchr", ] @@ -2047,12 +2042,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.20.1" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82881c4be219ab5faaf2ad5e5e5ecdff8c66bd7402ca3160975c93b24961afd1" -dependencies = [ - "portable-atomic", -] +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "openssl-probe" @@ -2094,9 +2086,9 @@ dependencies = [ [[package]] name = "parquet" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8" +checksum = "310c46a70a3ba90d98fec39fa2da6d9d731e544191da6fb56c9d199484d0dd3e" dependencies = [ "ahash", "arrow-array", @@ -2670,9 +2662,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.13" +version = "0.23.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2dabaac7466917e566adb06783a81ca48944c6898a1b08b9374106dd671f4c8" +checksum = "415d9944693cb90382053259f89fbb077ea730ad7273047ec63b19bc9b160ba8" dependencies = [ "once_cell", "ring", @@ -2744,9 +2736,9 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.24" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9aaafd5a2b6e3d657ff009d82fbd630b6bd54dd4eb06f21693925cdf80f9b8b" +checksum = "01227be5826fa0690321a2ba6c5cd57a19cf3f6a09e76973b58e61de6ab9d1c1" dependencies = [ "windows-sys 0.59.0", ] diff --git a/Cargo.toml b/Cargo.toml index d2d3e79ed..df72cd40a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "41.0.0" +version = "42.0.0" homepage = "https://datafusion.apache.org/python" repository = "https://github.com/apache/datafusion-python" authors = ["Apache DataFusion "] @@ -35,25 +35,21 @@ substrait = ["dep:datafusion-substrait"] [dependencies] tokio = { version = 
"1.39", features = ["macros", "rt", "rt-multi-thread", "sync"] } -rand = "0.8" pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] } arrow = { version = "53", features = ["pyarrow"] } datafusion = { version = "42.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } datafusion-substrait = { version = "42.0.0", optional = true } datafusion-proto = { version = "42.0.0" } prost = "0.13" # keep in line with `datafusion-substrait` -prost-types = "0.13" # keep in line with `datafusion-substrait` uuid = { version = "1.9", features = ["v4"] } mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] } async-trait = "0.1" futures = "0.3" object_store = { version = "0.11.0", features = ["aws", "gcp", "azure", "http"] } -parking_lot = "0.12" -regex-syntax = "0.8" -syn = "2.0.79" url = "2" [build-dependencies] +prost-types = "0.13" # keep in line with `datafusion-substrait` pyo3-build-config = "0.22" [lib] From cdec2025383ed2c6549ffaa37f1dee199e7d1f9b Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 8 Oct 2024 17:30:57 -0400 Subject: [PATCH 054/248] Ts/minor updates release process (#903) * Add instructions for updating submodule to test a release * Apply formatting to changelog script --- dev/release/README.md | 1 + dev/release/generate-changelog.py | 76 +++++++++++++++++++------------ 2 files changed, 48 insertions(+), 29 deletions(-) diff --git a/dev/release/README.md b/dev/release/README.md index 49fd9de2d..b2c015e1d 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -169,6 +169,7 @@ cd datafusion-python # checkout the release commit git fetch --tags git checkout 40.0.0-rc1 +git submodule update --init --recursive # create the env python3 -m venv venv diff --git a/dev/release/generate-changelog.py b/dev/release/generate-changelog.py index 5645d2f74..0f07457d0 100755 --- a/dev/release/generate-changelog.py +++ b/dev/release/generate-changelog.py @@ -22,18 +22,22 @@ import 
re import subprocess + def print_pulls(repo_name, title, pulls): - if len(pulls) > 0: + if len(pulls) > 0: print("**{}:**".format(title)) print() - for (pull, commit) in pulls: + for pull, commit in pulls: url = "https://github.com/{}/pull/{}".format(repo_name, pull.number) - print("- {} [#{}]({}) ({})".format(pull.title, pull.number, url, commit.author.login)) + print( + "- {} [#{}]({}) ({})".format( + pull.title, pull.number, url, commit.author.login + ) + ) print() def generate_changelog(repo, repo_name, tag1, tag2, version): - # get a list of commits between two tags print(f"Fetching list of commits between {tag1} and {tag2}", file=sys.stderr) comparison = repo.compare(tag1, tag2) @@ -61,29 +65,27 @@ def generate_changelog(repo, repo_name, tag1, tag2, version): # categorize the pull requests based on GitHub labels print("Categorizing pull requests", file=sys.stderr) - for (pull, commit) in all_pulls: - + for pull, commit in all_pulls: # see if PR title uses Conventional Commits - cc_type = '' - cc_scope = '' - cc_breaking = '' - parts = re.findall(r'^([a-z]+)(\([a-z]+\))?(!)?:', pull.title) + cc_type = "" + cc_breaking = "" + parts = re.findall(r"^([a-z]+)(\([a-z]+\))?(!)?:", pull.title) if len(parts) == 1: parts_tuple = parts[0] - cc_type = parts_tuple[0] # fix, feat, docs, chore - cc_scope = parts_tuple[1] # component within project - cc_breaking = parts_tuple[2] == '!' + cc_type = parts_tuple[0] # fix, feat, docs, chore + # cc_scope = parts_tuple[1] # component within project + cc_breaking = parts_tuple[2] == "!" 
labels = [label.name for label in pull.labels] - if 'api change' in labels or cc_breaking: + if "api change" in labels or cc_breaking: breaking.append((pull, commit)) - elif 'bug' in labels or cc_type == 'fix': + elif "bug" in labels or cc_type == "fix": bugs.append((pull, commit)) - elif 'performance' in labels or cc_type == 'perf': + elif "performance" in labels or cc_type == "perf": performance.append((pull, commit)) - elif 'enhancement' in labels or cc_type == 'feat': + elif "enhancement" in labels or cc_type == "feat": enhancements.append((pull, commit)) - elif 'documentation' in labels or cc_type == 'docs' or cc_type == 'doc': + elif "documentation" in labels or cc_type == "docs" or cc_type == "doc": docs.append((pull, commit)) else: other.append((pull, commit)) @@ -114,13 +116,19 @@ def generate_changelog(repo, repo_name, tag1, tag2, version): print(f"# Apache DataFusion Python {version} Changelog\n") # get the number of commits - commit_count = subprocess.check_output(f"git log --pretty=oneline {tag1}..{tag2} | wc -l", shell=True, text=True).strip() + commit_count = subprocess.check_output( + f"git log --pretty=oneline {tag1}..{tag2} | wc -l", shell=True, text=True + ).strip() # get number of contributors - contributor_count = subprocess.check_output(f"git shortlog -sn {tag1}..{tag2} | wc -l", shell=True, text=True).strip() + contributor_count = subprocess.check_output( + f"git shortlog -sn {tag1}..{tag2} | wc -l", shell=True, text=True + ).strip() - print(f"This release consists of {commit_count} commits from {contributor_count} contributors. " - f"See credits at the end of this changelog for more information.\n") + print( + f"This release consists of {commit_count} commits from {contributor_count} contributors. 
" + f"See credits at the end of this changelog for more information.\n" + ) print_pulls(repo_name, "Breaking changes", breaking) print_pulls(repo_name, "Performance related", performance) @@ -130,17 +138,24 @@ def generate_changelog(repo, repo_name, tag1, tag2, version): print_pulls(repo_name, "Other", other) # show code contributions - credits = subprocess.check_output(f"git shortlog -sn {tag1}..{tag2}", shell=True, text=True).rstrip() + credits = subprocess.check_output( + f"git shortlog -sn {tag1}..{tag2}", shell=True, text=True + ).rstrip() print("## Credits\n") - print("Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) " - "per contributor.\n") + print( + "Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) " + "per contributor.\n" + ) print("```") print(credits) print("```\n") - print("Thank you also to everyone who contributed in other ways such as filing issues, reviewing " - "PRs, and providing feedback on this release.\n") + print( + "Thank you also to everyone who contributed in other ways such as filing issues, reviewing " + "PRs, and providing feedback on this release.\n" + ) + def cli(args=None): """Process command line arguments.""" @@ -150,7 +165,9 @@ def cli(args=None): parser = argparse.ArgumentParser() parser.add_argument("tag1", help="The previous commit or tag (e.g. 0.1.0)") parser.add_argument("tag2", help="The current commit or tag (e.g. 
HEAD)") - parser.add_argument("version", help="The version number to include in the changelog") + parser.add_argument( + "version", help="The version number to include in the changelog" + ) args = parser.parse_args() token = os.getenv("GITHUB_TOKEN") @@ -160,5 +177,6 @@ def cli(args=None): repo = g.get_repo(project) generate_changelog(repo, project, args.tag1, args.tag2, args.version) + if __name__ == "__main__": - cli() \ No newline at end of file + cli() From 840b5de08dad7dd9cd0d8db19d9245dcace721ce Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Mon, 14 Oct 2024 09:55:46 -0500 Subject: [PATCH 055/248] fix: remove use of deprecated `make_scalar_function` (#906) * fix: remove use of deprecated `make_scalar_function` `make_scalar_function` has been deprecated since v36 [0]. It is being removed from the public api in v43 [1]. [0]: https://github.com/apache/datafusion/pull/8878 [1]: https://github.com/apache/datafusion/pull/12505 * remove use of `.unwrap()` from pyarrow_function_to_rust --- src/udf.rs | 65 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/src/udf.rs b/src/udf.rs index 7d5db2f96..ec8efb169 100644 --- a/src/udf.rs +++ b/src/udf.rs @@ -24,39 +24,56 @@ use datafusion::arrow::datatypes::DataType; use datafusion::arrow::pyarrow::FromPyArrow; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; use datafusion::error::DataFusionError; -use datafusion::logical_expr::create_udf; use datafusion::logical_expr::function::ScalarFunctionImplementation; use datafusion::logical_expr::ScalarUDF; +use datafusion::logical_expr::{create_udf, ColumnarValue}; use crate::expr::PyExpr; use crate::utils::parse_volatility; +/// Create a Rust callable function fr a python function that expects pyarrow arrays +fn pyarrow_function_to_rust( + func: PyObject, +) -> impl Fn(&[ArrayRef]) -> Result { + move |args: &[ArrayRef]| -> Result { + Python::with_gil(|py| { + // 1. 
cast args to Pyarrow arrays + let py_args = args + .iter() + .map(|arg| { + arg.into_data() + .to_pyarrow(py) + .map_err(|e| DataFusionError::Execution(format!("{e:?}"))) + }) + .collect::, _>>()?; + let py_args = PyTuple::new_bound(py, py_args); + + // 2. call function + let value = func + .call_bound(py, py_args, None) + .map_err(|e| DataFusionError::Execution(format!("{e:?}")))?; + + // 3. cast to arrow::array::Array + let array_data = ArrayData::from_pyarrow_bound(value.bind(py)) + .map_err(|e| DataFusionError::Execution(format!("{e:?}")))?; + Ok(make_array(array_data)) + }) + } +} + /// Create a DataFusion's UDF implementation from a python function /// that expects pyarrow arrays. This is more efficient as it performs /// a zero-copy of the contents. -fn to_rust_function(func: PyObject) -> ScalarFunctionImplementation { - #[allow(deprecated)] - datafusion::physical_plan::functions::make_scalar_function( - move |args: &[ArrayRef]| -> Result { - Python::with_gil(|py| { - // 1. cast args to Pyarrow arrays - let py_args = args - .iter() - .map(|arg| arg.into_data().to_pyarrow(py).unwrap()) - .collect::>(); - let py_args = PyTuple::new_bound(py, py_args); - - // 2. call function - let value = func - .call_bound(py, py_args, None) - .map_err(|e| DataFusionError::Execution(format!("{e:?}")))?; +fn to_scalar_function_impl(func: PyObject) -> ScalarFunctionImplementation { + // Make the python function callable from rust + let pyarrow_func = pyarrow_function_to_rust(func); - // 3. 
cast to arrow::array::Array - let array_data = ArrayData::from_pyarrow_bound(value.bind(py)).unwrap(); - Ok(make_array(array_data)) - }) - }, - ) + // Convert input/output from datafusion ColumnarValue to arrow arrays + Arc::new(move |args: &[ColumnarValue]| { + let array_refs = ColumnarValue::values_to_arrays(args)?; + let array_result = pyarrow_func(&array_refs)?; + Ok(array_result.into()) + }) } /// Represents a PyScalarUDF @@ -82,7 +99,7 @@ impl PyScalarUDF { input_types.0, Arc::new(return_type.0), parse_volatility(volatility)?, - to_rust_function(func), + to_scalar_function_impl(func), ); Ok(Self { function }) } From 3d751728467a4210007bd9d266f22ae0e291d63f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 14 Oct 2024 12:05:45 -0500 Subject: [PATCH 056/248] build(deps): bump pyo3 from 0.22.3 to 0.22.4 (#910) Bumps [pyo3](https://github.com/pyo3/pyo3) from 0.22.3 to 0.22.4. - [Release notes](https://github.com/pyo3/pyo3/releases) - [Changelog](https://github.com/PyO3/pyo3/blob/main/CHANGELOG.md) - [Commits](https://github.com/pyo3/pyo3/commits) --- updated-dependencies: - dependency-name: pyo3 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 815323bf4..56c368f46 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2342,9 +2342,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.22.3" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15ee168e30649f7f234c3d49ef5a7a6cbf5134289bc46c29ff3155fa3221c225" +checksum = "00e89ce2565d6044ca31a3eb79a334c3a79a841120a98f64eea9f579564cb691" dependencies = [ "cfg-if", "indoc", @@ -2360,9 +2360,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.22.3" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e61cef80755fe9e46bb8a0b8f20752ca7676dcc07a5277d8b7768c6172e529b3" +checksum = "d8afbaf3abd7325e08f35ffb8deb5892046fcb2608b703db6a583a5ba4cea01e" dependencies = [ "once_cell", "target-lexicon", @@ -2370,9 +2370,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.22.3" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67ce096073ec5405f5ee2b8b31f03a68e02aa10d5d4f565eca04acc41931fa1c" +checksum = "ec15a5ba277339d04763f4c23d85987a5b08cbb494860be141e6a10a8eb88022" dependencies = [ "libc", "pyo3-build-config", @@ -2380,9 +2380,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.22.3" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2440c6d12bc8f3ae39f1e775266fa5122fd0c8891ce7520fa6048e683ad3de28" +checksum = "15e0f01b5364bcfbb686a52fc4181d412b708a68ed20c330db9fc8d2c2bf5a43" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -2392,9 +2392,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.22.3" +version = "0.22.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1be962f0e06da8f8465729ea2cb71a416d2257dff56cbe40a70d3e62a93ae5d1" +checksum = "a09b550200e1e5ed9176976d0060cbc2ea82dc8515da07885e7b8153a85caacb" dependencies = [ "heck 0.5.0", "proc-macro2", From 72f274385792d6eee3d9053ed786966b4899b24d Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Tue, 15 Oct 2024 13:20:22 +0200 Subject: [PATCH 057/248] feat: expose drop method (#913) --- python/datafusion/dataframe.py | 11 +++++++++++ python/tests/test_dataframe.py | 11 +++++++++++ src/dataframe.rs | 7 +++++++ 3 files changed, 29 insertions(+) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index c5ac0bb89..a9e4d4d10 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -129,6 +129,17 @@ def select(self, *exprs: Expr | str) -> DataFrame: ] return DataFrame(self.df.select(*exprs_internal)) + def drop(self, *columns: str) -> DataFrame: + """Drop arbitrary amount of columns. + + Args: + columns: Column names to drop from the dataframe. + + Returns: + DataFrame with those columns removed in the projection. + """ + return DataFrame(self.df.drop(*columns)) + def filter(self, *predicates: Expr) -> DataFrame: """Return a DataFrame for which ``predicate`` evaluates to ``True``. 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index e89c57159..88c642a7d 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -169,6 +169,17 @@ def test_sort(df): assert table.to_pydict() == expected +def test_drop(df): + df = df.drop("c") + + # execute and collect the first (and only) batch + result = df.collect()[0] + + assert df.schema().names == ["a", "b"] + assert result.column(0) == pa.array([1, 2, 3]) + assert result.column(1) == pa.array([4, 5, 6]) + + def test_limit(df): df = df.limit(1) diff --git a/src/dataframe.rs b/src/dataframe.rs index e77ca8425..db243704a 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -170,6 +170,13 @@ impl PyDataFrame { Ok(Self::new(df)) } + #[pyo3(signature = (*args))] + fn drop(&self, args: Vec) -> PyResult { + let cols = args.iter().map(|s| s.as_ref()).collect::>(); + let df = self.df.as_ref().clone().drop_columns(&cols)?; + Ok(Self::new(df)) + } + fn filter(&self, predicate: PyExpr) -> PyResult { let df = self.df.as_ref().clone().filter(predicate.into())?; Ok(Self::new(df)) From b4b03fe10fab72cc5606a193b58e1c9ae5031318 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Tue, 15 Oct 2024 13:20:39 +0200 Subject: [PATCH 058/248] feat: expose `join_on` (#914) * feat: expose join_on method * test: improve join_on case --- python/datafusion/dataframe.py | 25 ++++++++++++++++++++++- python/tests/test_dataframe.py | 36 ++++++++++++++++++++++++++++++++++ src/dataframe.rs | 25 +++++++++++++++++++++++ 3 files changed, 85 insertions(+), 1 deletion(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index a9e4d4d10..60203ffb4 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -21,7 +21,7 @@ from __future__ import annotations -from typing import Any, List, TYPE_CHECKING +from typing import Any, List, TYPE_CHECKING, Literal from datafusion.record_batch import 
RecordBatchStream from typing_extensions import deprecated from datafusion.plan import LogicalPlan, ExecutionPlan @@ -304,6 +304,29 @@ def join( """ return DataFrame(self.df.join(right.df, join_keys, how)) + def join_on( + self, + right: DataFrame, + *on_exprs: Expr, + how: Literal["inner", "left", "right", "full", "semi", "anti"] = "inner", + ) -> DataFrame: + """Join two :py:class:`DataFrame`using the specified expressions. + + On expressions are used to support in-equality predicates. Equality + predicates are correctly optimized + + Args: + right: Other DataFrame to join with. + on_exprs: single or multiple (in)-equality predicates. + how: Type of join to perform. Supported types are "inner", "left", + "right", "full", "semi", "anti". + + Returns: + DataFrame after join. + """ + exprs = [expr.expr for expr in on_exprs] + return DataFrame(self.df.join_on(right.df, exprs, how)) + def explain(self, verbose: bool = False, analyze: bool = False) -> DataFrame: """Return a DataFrame with the explanation of its plan so far. 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 88c642a7d..6330ede04 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -270,6 +270,42 @@ def test_join(): assert table.to_pydict() == expected +def test_join_on(): + ctx = SessionContext() + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + df = ctx.create_dataframe([[batch]], "l") + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2]), pa.array([-8, 10])], + names=["a", "c"], + ) + df1 = ctx.create_dataframe([[batch]], "r") + + df2 = df.join_on(df1, column("l.a").__eq__(column("r.a")), how="inner") + df2.show() + df2 = df2.sort(column("l.a")) + table = pa.Table.from_batches(df2.collect()) + + expected = {"a": [1, 2], "c": [-8, 10], "b": [4, 5]} + assert table.to_pydict() == expected + + df3 = df.join_on( + df1, + column("l.a").__eq__(column("r.a")), + column("l.a").__lt__(column("r.c")), + how="inner", + ) + df3.show() + df3 = df3.sort(column("l.a")) + table = pa.Table.from_batches(df3.collect()) + expected = {"a": [2], "c": [10], "b": [5]} + assert table.to_pydict() == expected + + def test_distinct(): ctx = SessionContext() diff --git a/src/dataframe.rs b/src/dataframe.rs index db243704a..fa6c1d44f 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -300,6 +300,31 @@ impl PyDataFrame { Ok(Self::new(df)) } + fn join_on(&self, right: PyDataFrame, on_exprs: Vec, how: &str) -> PyResult { + let join_type = match how { + "inner" => JoinType::Inner, + "left" => JoinType::Left, + "right" => JoinType::Right, + "full" => JoinType::Full, + "semi" => JoinType::LeftSemi, + "anti" => JoinType::LeftAnti, + how => { + return Err(DataFusionError::Common(format!( + "The join type {how} does not exist or is not implemented" + )) + .into()); + } + }; + let exprs: Vec = on_exprs.into_iter().map(|e| e.into()).collect(); + + let df = self + .df + .as_ref() + .clone() + 
.join_on(right.df.as_ref().clone(), join_type, exprs)?; + Ok(Self::new(df)) + } + /// Print the query plan #[pyo3(signature = (verbose=false, analyze=false))] fn explain(&self, py: Python, verbose: bool, analyze: bool) -> PyResult<()> { From 494b89a522541bbaf9c3cd5d7b6bd7ab7218a399 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Tue, 15 Oct 2024 13:21:30 +0200 Subject: [PATCH 059/248] refactor: from_arrow (#917) --- python/datafusion/context.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 957d7e311..5221c866c 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -30,7 +30,7 @@ from datafusion.record_batch import RecordBatchStream from datafusion.udf import ScalarUDF, AggregateUDF, WindowUDF -from typing import Any, TYPE_CHECKING +from typing import Any, TYPE_CHECKING, Protocol from typing_extensions import deprecated if TYPE_CHECKING: @@ -41,6 +41,28 @@ from datafusion.plan import LogicalPlan, ExecutionPlan +class ArrowStreamExportable(Protocol): + """Type hint for object exporting Arrow C Stream via Arrow PyCapsule Interface. + + https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + """ + + def __arrow_c_stream__( # noqa: D105 + self, requested_schema: object | None = None + ) -> object: ... + + +class ArrowArrayExportable(Protocol): + """Type hint for object exporting Arrow C Array via Arrow PyCapsule Interface. + + https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + """ + + def __arrow_c_array__( # noqa: D105 + self, requested_schema: object | None = None + ) -> tuple[object, object]: ... 
+ + class SessionConfig: """Session configuration options.""" @@ -592,12 +614,18 @@ def from_pydict( """ return DataFrame(self.ctx.from_pydict(data, name)) - def from_arrow(self, data: Any, name: str | None = None) -> DataFrame: + def from_arrow( + self, + data: ArrowStreamExportable | ArrowArrayExportable, + name: str | None = None, + ) -> DataFrame: """Create a :py:class:`~datafusion.dataframe.DataFrame` from an Arrow source. The Arrow data source can be any object that implements either ``__arrow_c_stream__`` or ``__arrow_c_array__``. For the latter, it must return - a struct array. Common examples of sources from pyarrow include + a struct array. + + Arrow data can be Polars, Pandas, Pyarrow etc. Args: data: Arrow data source. From 0905f5fca4b763fc61e5e2093a85ad05e203d7fb Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Wed, 16 Oct 2024 14:36:56 +0200 Subject: [PATCH 060/248] feat: add fill_null/nan (#919) --- python/datafusion/expr.py | 12 ++++++++++++ python/datafusion/functions.py | 6 ++++++ python/tests/test_expr.py | 33 ++++++++++++++++++++++++++++++--- src/functions.rs | 6 ++++++ 4 files changed, 54 insertions(+), 3 deletions(-) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 8600627ae..c4e7713f3 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -406,6 +406,18 @@ def is_not_null(self) -> Expr: """Returns ``True`` if this expression is not null.""" return Expr(self.expr.is_not_null()) + def fill_nan(self, value: Any | Expr | None = None) -> Expr: + """Fill NaN values with a provided value.""" + if not isinstance(value, Expr): + value = Expr.literal(value) + return Expr(functions_internal.nanvl(self.expr, value.expr)) + + def fill_null(self, value: Any | Expr | None = None) -> Expr: + """Fill NULL values with a provided value.""" + if not isinstance(value, Expr): + value = Expr.literal(value) + return Expr(functions_internal.nvl(self.expr, value.expr)) + 
_to_pyarrow_types = { float: pa.float64(), int: pa.int64(), diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 0401afbc4..727321979 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -186,6 +186,7 @@ "min", "named_struct", "nanvl", + "nvl", "now", "nth_value", "nullif", @@ -673,6 +674,11 @@ def nanvl(x: Expr, y: Expr) -> Expr: return Expr(f.nanvl(x.expr, y.expr)) +def nvl(x: Expr, y: Expr) -> Expr: + """Returns ``x`` if ``x`` is not ``NULL``. Otherwise returns ``y``.""" + return Expr(f.nvl(x.expr, y.expr)) + + def octet_length(arg: Expr) -> Expr: """Returns the number of bytes of a string.""" return Expr(f.octet_length(arg.expr)) diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py index b58177f16..1847edef2 100644 --- a/python/tests/test_expr.py +++ b/python/tests/test_expr.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -import pyarrow +import pyarrow as pa import pytest from datafusion import SessionContext, col from datafusion.expr import ( @@ -125,8 +125,8 @@ def test_sort(test_ctx): def test_relational_expr(test_ctx): ctx = SessionContext() - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array(["alpha", "beta", "gamma"])], + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array(["alpha", "beta", "gamma"])], names=["a", "b"], ) df = ctx.create_dataframe([[batch]], name="batch_array") @@ -216,3 +216,30 @@ def test_display_name_deprecation(): # returns appropriate result assert name == expr.schema_name() assert name == "foo" + + +@pytest.fixture +def df(): + ctx = SessionContext() + + # create a RecordBatch and a new DataFrame from it + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, None]), pa.array([4, None, 6]), pa.array([None, None, 8])], + names=["a", "b", "c"], + ) + + return ctx.from_arrow(batch) + + +def test_fill_null(df): + df = df.select( + 
col("a").fill_null(100).alias("a"), + col("b").fill_null(25).alias("b"), + col("c").fill_null(1234).alias("c"), + ) + df.show() + result = df.collect()[0] + + assert result.column(0) == pa.array([1, 2, 100]) + assert result.column(1) == pa.array([4, 25, 6]) + assert result.column(2) == pa.array([1234, 1234, 8]) diff --git a/src/functions.rs b/src/functions.rs index 6f8dd7ada..24d33af39 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -490,6 +490,11 @@ expr_fn!( x y, "Returns x if x is not NaN otherwise returns y." ); +expr_fn!( + nvl, + x y, + "Returns x if x is not NULL otherwise returns y." +); expr_fn!(nullif, arg_1 arg_2); expr_fn!(octet_length, args, "Returns number of bytes in the string. Since this version of the function accepts type character directly, it will not strip trailing spaces."); expr_fn_vec!(overlay); @@ -913,6 +918,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(min))?; m.add_wrapped(wrap_pyfunction!(named_struct))?; m.add_wrapped(wrap_pyfunction!(nanvl))?; + m.add_wrapped(wrap_pyfunction!(nvl))?; m.add_wrapped(wrap_pyfunction!(now))?; m.add_wrapped(wrap_pyfunction!(nullif))?; m.add_wrapped(wrap_pyfunction!(octet_length))?; From fc7e3e546dbb03783e66d44837d75a1e6ad62827 Mon Sep 17 00:00:00 2001 From: kosiew Date: Sat, 19 Oct 2024 06:15:28 +0800 Subject: [PATCH 061/248] Change requires-python version (#924) --- pyproject.toml | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6e10333a0..d327c0ec1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,8 +23,8 @@ build-backend = "maturin" name = "datafusion" description = "Build and run queries against data" readme = "README.md" -license = {file = "LICENSE.txt"} -requires-python = ">=3.6" +license = { file = "LICENSE.txt" } +requires-python = ">=3.7" keywords = ["datafusion", "dataframe", "rust", "query-engine"] classifier = [ "Development Status :: 2 - Pre-Alpha", @@ 
-42,10 +42,7 @@ classifier = [ "Programming Language :: Python", "Programming Language :: Rust", ] -dependencies = [ - "pyarrow>=11.0.0", - "typing-extensions;python_version<'3.13'", -] +dependencies = ["pyarrow>=11.0.0", "typing-extensions;python_version<'3.13'"] [project.urls] homepage = "https://datafusion.apache.org/python" @@ -58,9 +55,7 @@ profile = "black" [tool.maturin] python-source = "python" module-name = "datafusion._internal" -include = [ - { path = "Cargo.lock", format = "sdist" } -] +include = [{ path = "Cargo.lock", format = "sdist" }] exclude = [".github/**", "ci/**", ".asf.yaml"] # Require Cargo.lock is up to date locked = true From 7cca0283e7837426e6ccdf9d5c8cbfc0c8b239c9 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Sat, 19 Oct 2024 00:57:57 +0200 Subject: [PATCH 062/248] feat: add `with_columns` (#909) * feat: add with_columns * chore: add doc * Format docstring to render in online documentation --------- Co-authored-by: Tim Saucer --- python/datafusion/dataframe.py | 47 +++++++++++++++++++++++++++++++++- python/tests/test_dataframe.py | 31 ++++++++++++++++++++++ src/dataframe.rs | 10 ++++++++ 3 files changed, 87 insertions(+), 1 deletion(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 60203ffb4..9c0953c35 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -21,7 +21,7 @@ from __future__ import annotations -from typing import Any, List, TYPE_CHECKING, Literal +from typing import Any, Iterable, List, Literal, TYPE_CHECKING from datafusion.record_batch import RecordBatchStream from typing_extensions import deprecated from datafusion.plan import LogicalPlan, ExecutionPlan @@ -171,6 +171,51 @@ def with_column(self, name: str, expr: Expr) -> DataFrame: """ return DataFrame(self.df.with_column(name, expr.expr)) + def with_columns( + self, *exprs: Expr | Iterable[Expr], **named_exprs: Expr + ) -> DataFrame: + """Add columns to 
the DataFrame. + + By passing expressions, iteratables of expressions, or named expressions. To + pass named expressions use the form name=Expr. + + Example usage: The following will add 4 columns labeled a, b, c, and d:: + + df = df.with_columns( + lit(0).alias('a'), + [lit(1).alias('b'), lit(2).alias('c')], + d=lit(3) + ) + + Args: + exprs: Either a single expression or an iterable of expressions to add. + named_exprs: Named expressions in the form of ``name=expr`` + + Returns: + DataFrame with the new columns added. + """ + + def _simplify_expression( + *exprs: Expr | Iterable[Expr], **named_exprs: Expr + ) -> list[Expr]: + expr_list = [] + for expr in exprs: + if isinstance(expr, Expr): + expr_list.append(expr.expr) + elif isinstance(expr, Iterable): + for inner_expr in expr: + expr_list.append(inner_expr.expr) + else: + raise NotImplementedError + if named_exprs: + for alias, expr in named_exprs.items(): + expr_list.append(expr.alias(alias).expr) + return expr_list + + expressions = _simplify_expression(*exprs, **named_exprs) + + return DataFrame(self.df.with_columns(expressions)) + def with_column_renamed(self, old_name: str, new_name: str) -> DataFrame: r"""Rename one column by applying a new projection. 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 6330ede04..0d4a7dcb0 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -216,6 +216,37 @@ def test_with_column(df): assert result.column(2) == pa.array([5, 7, 9]) +def test_with_columns(df): + df = df.with_columns( + (column("a") + column("b")).alias("c"), + (column("a") + column("b")).alias("d"), + [ + (column("a") + column("b")).alias("e"), + (column("a") + column("b")).alias("f"), + ], + g=(column("a") + column("b")), + ) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + assert result.schema.field(0).name == "a" + assert result.schema.field(1).name == "b" + assert result.schema.field(2).name == "c" + assert result.schema.field(3).name == "d" + assert result.schema.field(4).name == "e" + assert result.schema.field(5).name == "f" + assert result.schema.field(6).name == "g" + + assert result.column(0) == pa.array([1, 2, 3]) + assert result.column(1) == pa.array([4, 5, 6]) + assert result.column(2) == pa.array([5, 7, 9]) + assert result.column(3) == pa.array([5, 7, 9]) + assert result.column(4) == pa.array([5, 7, 9]) + assert result.column(5) == pa.array([5, 7, 9]) + assert result.column(6) == pa.array([5, 7, 9]) + + def test_with_column_renamed(df): df = df.with_column("c", column("a") + column("b")).with_column_renamed("c", "sum") diff --git a/src/dataframe.rs b/src/dataframe.rs index fa6c1d44f..dd5d89ce9 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -187,6 +187,16 @@ impl PyDataFrame { Ok(Self::new(df)) } + fn with_columns(&self, exprs: Vec) -> PyResult { + let mut df = self.df.as_ref().clone(); + for expr in exprs { + let expr: Expr = expr.into(); + let name = format!("{}", expr.schema_name()); + df = df.with_column(name.as_str(), expr)? + } + Ok(Self::new(df)) + } + /// Rename one column by applying a new projection. This is a no-op if the column to be /// renamed does not exist. 
fn with_column_renamed(&self, old_name: &str, new_name: &str) -> PyResult { From 70c099aad8ec337ef88e27c125a8eeba328d62de Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Mon, 21 Oct 2024 19:21:25 +0200 Subject: [PATCH 063/248] feat: add `cast` to DataFrame (#916) * feat: add with_columns * feat: add top level cast * chore: improve docstring --------- Co-authored-by: Tim Saucer --- python/datafusion/dataframe.py | 13 +++++++++++++ python/tests/test_dataframe.py | 9 +++++++++ 2 files changed, 22 insertions(+) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 9c0953c35..3ed6d40fe 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -21,6 +21,7 @@ from __future__ import annotations + from typing import Any, Iterable, List, Literal, TYPE_CHECKING from datafusion.record_batch import RecordBatchStream from typing_extensions import deprecated @@ -267,6 +268,18 @@ def sort(self, *exprs: Expr | SortExpr) -> DataFrame: exprs_raw = [sort_or_default(expr) for expr in exprs] return DataFrame(self.df.sort(*exprs_raw)) + def cast(self, mapping: dict[str, pa.DataType[Any]]) -> DataFrame: + """Cast one or more columns to a different data type. + + Args: + mapping: Mapped with column as key and column dtype as value. + + Returns: + DataFrame after casting columns + """ + exprs = [Expr.column(col).cast(dtype) for col, dtype in mapping.items()] + return self.with_columns(exprs) + def limit(self, count: int, offset: int = 0) -> DataFrame: """Return a new :py:class:`DataFrame` with a limited number of rows. 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 0d4a7dcb0..bb408c9c9 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -247,6 +247,15 @@ def test_with_columns(df): assert result.column(6) == pa.array([5, 7, 9]) +def test_cast(df): + df = df.cast({"a": pa.float16(), "b": pa.list_(pa.uint32())}) + expected = pa.schema( + [("a", pa.float16()), ("b", pa.list_(pa.uint32())), ("c", pa.int64())] + ) + + assert df.schema() == expected + + def test_with_column_renamed(df): df = df.with_column("c", column("a") + column("b")).with_column_renamed("c", "sum") From f59dd08bfbc0f01cc16b858465d03c3a01ba647c Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Mon, 21 Oct 2024 23:02:49 +0200 Subject: [PATCH 064/248] feat: add `head`, `tail` methods (#915) * feat: add head, tail methods * chore: add default head/tail --- python/datafusion/dataframe.py | 25 +++++++++++++++++++++++++ python/tests/test_dataframe.py | 22 ++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 3ed6d40fe..e4f8073d3 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -292,6 +292,31 @@ def limit(self, count: int, offset: int = 0) -> DataFrame: """ return DataFrame(self.df.limit(count, offset)) + def head(self, n: int = 5) -> DataFrame: + """Return a new :py:class:`DataFrame` with a limited number of rows. + + Args: + n: Number of rows to take from the head of the DataFrame. + + Returns: + DataFrame after limiting. + """ + return DataFrame(self.df.limit(n, 0)) + + def tail(self, n: int = 5) -> DataFrame: + """Return a new :py:class:`DataFrame` with a limited number of rows. + + Be aware this could be potentially expensive since the row size needs to be + determined of the dataframe. This is done by collecting it. 
+ + Args: + n: Number of rows to take from the tail of the DataFrame. + + Returns: + DataFrame after limiting. + """ + return DataFrame(self.df.limit(n, max(0, self.count() - n))) + def collect(self) -> list[pa.RecordBatch]: """Execute this :py:class:`DataFrame` and collect results into memory. diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index bb408c9c9..d73f5ebde 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -201,6 +201,28 @@ def test_limit_with_offset(df): assert len(result.column(1)) == 1 +def test_head(df): + df = df.head(1) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + assert result.column(0) == pa.array([1]) + assert result.column(1) == pa.array([4]) + assert result.column(2) == pa.array([8]) + + +def test_tail(df): + df = df.tail(1) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + assert result.column(0) == pa.array([3]) + assert result.column(1) == pa.array([6]) + assert result.column(2) == pa.array([8]) + + def test_with_column(df): df = df.with_column("c", column("a") + column("b")) From 56b72438004965f36dd4ce7e14d62a533ab2026f Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:23:36 +0200 Subject: [PATCH 065/248] chore: deprecate `select_columns` (#911) * chore: deprecate select_columns * chore: lint * Update user document to use select instead of select_columns * Update all tpch examples to use select instead of select_columns --------- Co-authored-by: Tim Saucer --- .../common-operations/select-and-filter.rst | 4 ++-- examples/import.py | 10 +++++----- examples/tpch/convert_data_to_parquet.py | 2 +- examples/tpch/q02_minimum_cost_supplier.py | 12 ++++++------ examples/tpch/q03_shipping_priority.py | 8 ++++---- examples/tpch/q04_order_priority_checking.py | 6 +++--- examples/tpch/q05_local_supplier_volume.py | 12 ++++++------ 
.../tpch/q06_forecasting_revenue_change.py | 2 +- examples/tpch/q07_volume_shipping.py | 10 +++++----- examples/tpch/q08_market_share.py | 18 ++++++++---------- .../tpch/q09_product_type_profit_measure.py | 14 ++++++-------- examples/tpch/q10_returned_item_reporting.py | 10 +++++----- .../tpch/q11_important_stock_identification.py | 8 ++++---- examples/tpch/q12_ship_mode_order_priority.py | 4 ++-- examples/tpch/q13_customer_distribution.py | 6 ++---- examples/tpch/q14_promotion_effect.py | 6 ++---- examples/tpch/q15_top_supplier.py | 6 +++--- .../tpch/q16_part_supplier_relationship.py | 8 ++++---- examples/tpch/q17_small_quantity_order.py | 4 ++-- examples/tpch/q18_large_volume_customer.py | 8 ++++---- examples/tpch/q19_discounted_revenue.py | 4 ++-- examples/tpch/q20_potential_part_promotion.py | 14 ++++++-------- .../tpch/q21_suppliers_kept_orders_waiting.py | 8 ++++---- examples/tpch/q22_global_sales_opportunity.py | 6 ++---- python/datafusion/dataframe.py | 3 +++ python/tests/test_dataframe.py | 14 ++++++-------- 26 files changed, 98 insertions(+), 109 deletions(-) diff --git a/docs/source/user-guide/common-operations/select-and-filter.rst b/docs/source/user-guide/common-operations/select-and-filter.rst index 92b4841b2..075909129 100644 --- a/docs/source/user-guide/common-operations/select-and-filter.rst +++ b/docs/source/user-guide/common-operations/select-and-filter.rst @@ -33,7 +33,7 @@ DataFusion can work with several file types, to start simple we can use a subset ctx = SessionContext() df = ctx.read_parquet("yellow_trip_data.parquet") - df.select_columns("trip_distance", "passenger_count") + df.select("trip_distance", "passenger_count") For mathematical or logical operations use :py:func:`~datafusion.col` to select columns, and give meaningful names to the resulting operations using :py:func:`~datafusion.expr.Expr.alias` @@ -48,7 +48,7 @@ operations using :py:func:`~datafusion.expr.Expr.alias` Please be aware that all identifiers are effectively made 
lower-case in SQL, so if your file has capital letters (ex: Name) you must put your column name in double quotes or the selection won’t work. As an alternative for simple - column selection use :py:func:`~datafusion.dataframe.DataFrame.select_columns` without double quotes + column selection use :py:func:`~datafusion.dataframe.DataFrame.select` without double quotes For selecting columns with capital letters use ``'"VendorID"'`` diff --git a/examples/import.py b/examples/import.py index cd965cb46..c9d2e8cb6 100644 --- a/examples/import.py +++ b/examples/import.py @@ -28,7 +28,7 @@ # The dictionary keys represent column names and the dictionary values # represent column values df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]}) -assert type(df) == datafusion.DataFrame +assert type(df) is datafusion.DataFrame # Dataframe: # +---+---+ # | a | b | @@ -40,19 +40,19 @@ # Create a datafusion DataFrame from a Python list of rows df = ctx.from_pylist([{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}]) -assert type(df) == datafusion.DataFrame +assert type(df) is datafusion.DataFrame # Convert pandas DataFrame to datafusion DataFrame pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df = ctx.from_pandas(pandas_df) -assert type(df) == datafusion.DataFrame +assert type(df) is datafusion.DataFrame # Convert polars DataFrame to datafusion DataFrame polars_df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df = ctx.from_polars(polars_df) -assert type(df) == datafusion.DataFrame +assert type(df) is datafusion.DataFrame # Convert Arrow Table to datafusion DataFrame arrow_table = pa.Table.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]}) df = ctx.from_arrow(arrow_table) -assert type(df) == datafusion.DataFrame +assert type(df) is datafusion.DataFrame diff --git a/examples/tpch/convert_data_to_parquet.py b/examples/tpch/convert_data_to_parquet.py index a8091a708..cb0b2f0bd 100644 --- a/examples/tpch/convert_data_to_parquet.py +++ b/examples/tpch/convert_data_to_parquet.py 
@@ -138,6 +138,6 @@ df = ctx.read_csv(source_file, schema=schema, has_header=False, delimiter="|") - df = df.select_columns(*output_cols) + df = df.select(*output_cols) df.write_parquet(dest_file, compression="snappy") diff --git a/examples/tpch/q02_minimum_cost_supplier.py b/examples/tpch/q02_minimum_cost_supplier.py index 2171a2083..2440fdad6 100644 --- a/examples/tpch/q02_minimum_cost_supplier.py +++ b/examples/tpch/q02_minimum_cost_supplier.py @@ -43,10 +43,10 @@ ctx = SessionContext() -df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns( +df_part = ctx.read_parquet(get_data_path("part.parquet")).select( "p_partkey", "p_mfgr", "p_type", "p_size" ) -df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns( +df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select( "s_acctbal", "s_name", "s_address", @@ -55,13 +55,13 @@ "s_nationkey", "s_suppkey", ) -df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select_columns( +df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select( "ps_partkey", "ps_suppkey", "ps_supplycost" ) -df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns( +df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select( "n_nationkey", "n_regionkey", "n_name" ) -df_region = ctx.read_parquet(get_data_path("region.parquet")).select_columns( +df_region = ctx.read_parquet(get_data_path("region.parquet")).select( "r_regionkey", "r_name" ) @@ -115,7 +115,7 @@ # From the problem statement, these are the values we wish to output -df = df.select_columns( +df = df.select( "s_acctbal", "s_name", "n_name", diff --git a/examples/tpch/q03_shipping_priority.py b/examples/tpch/q03_shipping_priority.py index 6a4886d83..c4e8f461a 100644 --- a/examples/tpch/q03_shipping_priority.py +++ b/examples/tpch/q03_shipping_priority.py @@ -37,13 +37,13 @@ ctx = SessionContext() -df_customer = 
ctx.read_parquet(get_data_path("customer.parquet")).select_columns( +df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select( "c_mktsegment", "c_custkey" ) -df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns( +df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select( "o_orderdate", "o_shippriority", "o_custkey", "o_orderkey" ) -df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns( +df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select( "l_orderkey", "l_extendedprice", "l_discount", "l_shipdate" ) @@ -80,7 +80,7 @@ # Change the order that the columns are reported in just to match the spec -df = df.select_columns("l_orderkey", "revenue", "o_orderdate", "o_shippriority") +df = df.select("l_orderkey", "revenue", "o_orderdate", "o_shippriority") # Show result diff --git a/examples/tpch/q04_order_priority_checking.py b/examples/tpch/q04_order_priority_checking.py index 77c3bd43e..f10b74d91 100644 --- a/examples/tpch/q04_order_priority_checking.py +++ b/examples/tpch/q04_order_priority_checking.py @@ -39,10 +39,10 @@ ctx = SessionContext() -df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns( +df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select( "o_orderdate", "o_orderpriority", "o_orderkey" ) -df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns( +df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select( "l_orderkey", "l_commitdate", "l_receiptdate" ) @@ -54,7 +54,7 @@ # Limit results to cases where commitment date before receipt date # Aggregate the results so we only get one row to join with the order table. # Alternately, and likely more idiomatic is instead of `.aggregate` you could -# do `.select_columns("l_orderkey").distinct()`. The goal here is to show +# do `.select("l_orderkey").distinct()`. The goal here is to show # multiple examples of how to use Data Fusion. 
df_lineitem = df_lineitem.filter(col("l_commitdate") < col("l_receiptdate")).aggregate( [col("l_orderkey")], [] diff --git a/examples/tpch/q05_local_supplier_volume.py b/examples/tpch/q05_local_supplier_volume.py index f17f600a4..2a83d2d1a 100644 --- a/examples/tpch/q05_local_supplier_volume.py +++ b/examples/tpch/q05_local_supplier_volume.py @@ -47,22 +47,22 @@ ctx = SessionContext() -df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select_columns( +df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select( "c_custkey", "c_nationkey" ) -df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns( +df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select( "o_custkey", "o_orderkey", "o_orderdate" ) -df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns( +df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select( "l_orderkey", "l_suppkey", "l_extendedprice", "l_discount" ) -df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns( +df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select( "s_suppkey", "s_nationkey" ) -df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns( +df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select( "n_nationkey", "n_regionkey", "n_name" ) -df_region = ctx.read_parquet(get_data_path("region.parquet")).select_columns( +df_region = ctx.read_parquet(get_data_path("region.parquet")).select( "r_regionkey", "r_name" ) diff --git a/examples/tpch/q06_forecasting_revenue_change.py b/examples/tpch/q06_forecasting_revenue_change.py index 3beb9eb1f..eaf9b0c29 100644 --- a/examples/tpch/q06_forecasting_revenue_change.py +++ b/examples/tpch/q06_forecasting_revenue_change.py @@ -51,7 +51,7 @@ ctx = SessionContext() -df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns( +df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select( 
"l_shipdate", "l_quantity", "l_extendedprice", "l_discount" ) diff --git a/examples/tpch/q07_volume_shipping.py b/examples/tpch/q07_volume_shipping.py index 44c605a9b..a1d7d81ad 100644 --- a/examples/tpch/q07_volume_shipping.py +++ b/examples/tpch/q07_volume_shipping.py @@ -49,19 +49,19 @@ ctx = SessionContext() -df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns( +df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select( "s_suppkey", "s_nationkey" ) -df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns( +df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select( "l_shipdate", "l_extendedprice", "l_discount", "l_suppkey", "l_orderkey" ) -df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns( +df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select( "o_orderkey", "o_custkey" ) -df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select_columns( +df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select( "c_custkey", "c_nationkey" ) -df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns( +df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select( "n_nationkey", "n_name" ) diff --git a/examples/tpch/q08_market_share.py b/examples/tpch/q08_market_share.py index cd6bc1fa9..95fc0a871 100644 --- a/examples/tpch/q08_market_share.py +++ b/examples/tpch/q08_market_share.py @@ -47,25 +47,23 @@ ctx = SessionContext() -df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns( - "p_partkey", "p_type" -) -df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns( +df_part = ctx.read_parquet(get_data_path("part.parquet")).select("p_partkey", "p_type") +df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select( "s_suppkey", "s_nationkey" ) -df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns( +df_lineitem = 
ctx.read_parquet(get_data_path("lineitem.parquet")).select( "l_partkey", "l_extendedprice", "l_discount", "l_suppkey", "l_orderkey" ) -df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns( +df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select( "o_orderkey", "o_custkey", "o_orderdate" ) -df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select_columns( +df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select( "c_custkey", "c_nationkey" ) -df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns( +df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select( "n_nationkey", "n_name", "n_regionkey" ) -df_region = ctx.read_parquet(get_data_path("region.parquet")).select_columns( +df_region = ctx.read_parquet(get_data_path("region.parquet")).select( "r_regionkey", "r_name" ) @@ -133,7 +131,7 @@ # When we join to the customer dataframe, we don't want to confuse other columns, so only # select the supplier key that we need -df_national_suppliers = df_national_suppliers.select_columns("s_suppkey") +df_national_suppliers = df_national_suppliers.select("s_suppkey") # Part 3: Combine suppliers and customers and compute the market share diff --git a/examples/tpch/q09_product_type_profit_measure.py b/examples/tpch/q09_product_type_profit_measure.py index b4a7369f8..0295d3025 100644 --- a/examples/tpch/q09_product_type_profit_measure.py +++ b/examples/tpch/q09_product_type_profit_measure.py @@ -39,16 +39,14 @@ ctx = SessionContext() -df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns( - "p_partkey", "p_name" -) -df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns( +df_part = ctx.read_parquet(get_data_path("part.parquet")).select("p_partkey", "p_name") +df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select( "s_suppkey", "s_nationkey" ) -df_partsupp = 
ctx.read_parquet(get_data_path("partsupp.parquet")).select_columns( +df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select( "ps_suppkey", "ps_partkey", "ps_supplycost" ) -df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns( +df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select( "l_partkey", "l_extendedprice", "l_discount", @@ -56,10 +54,10 @@ "l_orderkey", "l_quantity", ) -df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns( +df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select( "o_orderkey", "o_custkey", "o_orderdate" ) -df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns( +df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select( "n_nationkey", "n_name", "n_regionkey" ) diff --git a/examples/tpch/q10_returned_item_reporting.py b/examples/tpch/q10_returned_item_reporting.py index 78327c3ad..25f81b2ff 100644 --- a/examples/tpch/q10_returned_item_reporting.py +++ b/examples/tpch/q10_returned_item_reporting.py @@ -44,7 +44,7 @@ ctx = SessionContext() -df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select_columns( +df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select( "c_custkey", "c_nationkey", "c_name", @@ -53,13 +53,13 @@ "c_phone", "c_comment", ) -df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns( +df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select( "l_extendedprice", "l_discount", "l_orderkey", "l_returnflag" ) -df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns( +df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select( "o_orderkey", "o_custkey", "o_orderdate" ) -df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns( +df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select( "n_nationkey", "n_name", "n_regionkey" ) @@ -87,7 +87,7 @@ df = df.join(df_nation, 
(["c_nationkey"], ["n_nationkey"]), how="inner") # These are the columns the problem statement requires -df = df.select_columns( +df = df.select( "c_custkey", "c_name", "revenue", diff --git a/examples/tpch/q11_important_stock_identification.py b/examples/tpch/q11_important_stock_identification.py index 391eb45b1..86ff2296b 100644 --- a/examples/tpch/q11_important_stock_identification.py +++ b/examples/tpch/q11_important_stock_identification.py @@ -37,13 +37,13 @@ ctx = SessionContext() -df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns( +df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select( "s_suppkey", "s_nationkey" ) -df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select_columns( +df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select( "ps_supplycost", "ps_availqty", "ps_suppkey", "ps_partkey" ) -df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns( +df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select( "n_nationkey", "n_name" ) @@ -75,7 +75,7 @@ df = df.filter(col("value") / col("total_value") >= lit(FRACTION)) # We only need to report on these two columns -df = df.select_columns("ps_partkey", "value") +df = df.select("ps_partkey", "value") # Sort in descending order of value df = df.sort(col("value").sort(ascending=False)) diff --git a/examples/tpch/q12_ship_mode_order_priority.py b/examples/tpch/q12_ship_mode_order_priority.py index 150870c64..c3fc0d2e9 100644 --- a/examples/tpch/q12_ship_mode_order_priority.py +++ b/examples/tpch/q12_ship_mode_order_priority.py @@ -42,10 +42,10 @@ ctx = SessionContext() -df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns( +df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select( "o_orderkey", "o_orderpriority" ) -df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns( +df_lineitem = 
ctx.read_parquet(get_data_path("lineitem.parquet")).select( "l_orderkey", "l_shipmode", "l_commitdate", "l_shipdate", "l_receiptdate" ) diff --git a/examples/tpch/q13_customer_distribution.py b/examples/tpch/q13_customer_distribution.py index bc0a5bd1f..f8b6c139d 100644 --- a/examples/tpch/q13_customer_distribution.py +++ b/examples/tpch/q13_customer_distribution.py @@ -38,12 +38,10 @@ ctx = SessionContext() -df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns( +df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select( "o_custkey", "o_comment" ) -df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select_columns( - "c_custkey" -) +df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select("c_custkey") # Use a regex to remove special cases df_orders = df_orders.filter( diff --git a/examples/tpch/q14_promotion_effect.py b/examples/tpch/q14_promotion_effect.py index 8cb1e4c5a..8224136ad 100644 --- a/examples/tpch/q14_promotion_effect.py +++ b/examples/tpch/q14_promotion_effect.py @@ -41,12 +41,10 @@ ctx = SessionContext() -df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns( +df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select( "l_partkey", "l_shipdate", "l_extendedprice", "l_discount" ) -df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns( - "p_partkey", "p_type" -) +df_part = ctx.read_parquet(get_data_path("part.parquet")).select("p_partkey", "p_type") # Check part type begins with PROMO diff --git a/examples/tpch/q15_top_supplier.py b/examples/tpch/q15_top_supplier.py index aa76093ec..44d5dd997 100644 --- a/examples/tpch/q15_top_supplier.py +++ b/examples/tpch/q15_top_supplier.py @@ -41,10 +41,10 @@ ctx = SessionContext() -df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns( +df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select( "l_suppkey", "l_shipdate", "l_extendedprice", 
"l_discount" ) -df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns( +df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select( "s_suppkey", "s_name", "s_address", @@ -79,7 +79,7 @@ df = df.join(df_supplier, (["l_suppkey"], ["s_suppkey"]), "inner") # Return only the columns requested -df = df.select_columns("s_suppkey", "s_name", "s_address", "s_phone", "total_revenue") +df = df.select("s_suppkey", "s_name", "s_address", "s_phone", "total_revenue") # If we have more than one, sort by supplier number (suppkey) df = df.sort(col("s_suppkey").sort()) diff --git a/examples/tpch/q16_part_supplier_relationship.py b/examples/tpch/q16_part_supplier_relationship.py index fdcb5b4db..cbdd9989a 100644 --- a/examples/tpch/q16_part_supplier_relationship.py +++ b/examples/tpch/q16_part_supplier_relationship.py @@ -40,13 +40,13 @@ ctx = SessionContext() -df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns( +df_part = ctx.read_parquet(get_data_path("part.parquet")).select( "p_partkey", "p_brand", "p_type", "p_size" ) -df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select_columns( +df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select( "ps_suppkey", "ps_partkey" ) -df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns( +df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select( "s_suppkey", "s_comment" ) @@ -75,7 +75,7 @@ df = df_part.join(df_partsupp, (["p_partkey"], ["ps_partkey"]), "inner") -df = df.select_columns("p_brand", "p_type", "p_size", "ps_suppkey").distinct() +df = df.select("p_brand", "p_type", "p_size", "ps_suppkey").distinct() df = df.aggregate( [col("p_brand"), col("p_type"), col("p_size")], diff --git a/examples/tpch/q17_small_quantity_order.py b/examples/tpch/q17_small_quantity_order.py index e0ee8bb90..ff494279b 100644 --- a/examples/tpch/q17_small_quantity_order.py +++ b/examples/tpch/q17_small_quantity_order.py @@ 
-38,10 +38,10 @@ ctx = SessionContext() -df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns( +df_part = ctx.read_parquet(get_data_path("part.parquet")).select( "p_partkey", "p_brand", "p_container" ) -df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns( +df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select( "l_partkey", "l_quantity", "l_extendedprice" ) diff --git a/examples/tpch/q18_large_volume_customer.py b/examples/tpch/q18_large_volume_customer.py index 10c5f6e6a..497615499 100644 --- a/examples/tpch/q18_large_volume_customer.py +++ b/examples/tpch/q18_large_volume_customer.py @@ -35,13 +35,13 @@ ctx = SessionContext() -df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select_columns( +df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select( "c_custkey", "c_name" ) -df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns( +df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select( "o_orderkey", "o_custkey", "o_orderdate", "o_totalprice" ) -df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns( +df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select( "l_orderkey", "l_quantity", "l_extendedprice" ) @@ -57,7 +57,7 @@ df = df.join(df_orders, (["l_orderkey"], ["o_orderkey"]), "inner") df = df.join(df_customer, (["o_custkey"], ["c_custkey"]), "inner") -df = df.select_columns( +df = df.select( "c_name", "c_custkey", "o_orderkey", "o_orderdate", "o_totalprice", "total_quantity" ) diff --git a/examples/tpch/q19_discounted_revenue.py b/examples/tpch/q19_discounted_revenue.py index b15cd98bf..c2fe2570d 100644 --- a/examples/tpch/q19_discounted_revenue.py +++ b/examples/tpch/q19_discounted_revenue.py @@ -52,10 +52,10 @@ ctx = SessionContext() -df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns( +df_part = ctx.read_parquet(get_data_path("part.parquet")).select( "p_partkey", 
"p_brand", "p_container", "p_size" ) -df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns( +df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select( "l_partkey", "l_quantity", "l_shipmode", diff --git a/examples/tpch/q20_potential_part_promotion.py b/examples/tpch/q20_potential_part_promotion.py index 4ced7aaa1..3a0edb1ec 100644 --- a/examples/tpch/q20_potential_part_promotion.py +++ b/examples/tpch/q20_potential_part_promotion.py @@ -40,19 +40,17 @@ ctx = SessionContext() -df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns( - "p_partkey", "p_name" -) -df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns( +df_part = ctx.read_parquet(get_data_path("part.parquet")).select("p_partkey", "p_name") +df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select( "l_shipdate", "l_partkey", "l_suppkey", "l_quantity" ) -df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select_columns( +df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select( "ps_partkey", "ps_suppkey", "ps_availqty" ) -df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns( +df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select( "s_suppkey", "s_address", "s_name", "s_nationkey" ) -df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns( +df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select( "n_nationkey", "n_name" ) @@ -91,7 +89,7 @@ df = df.join(df_nation, (["s_nationkey"], ["n_nationkey"]), "inner") # Restrict to the requested data per the problem statement -df = df.select_columns("s_name", "s_address").distinct() +df = df.select("s_name", "s_address").distinct() df = df.sort(col("s_name").sort()) diff --git a/examples/tpch/q21_suppliers_kept_orders_waiting.py b/examples/tpch/q21_suppliers_kept_orders_waiting.py index 6b1679e7d..d3d57acee 100644 --- 
a/examples/tpch/q21_suppliers_kept_orders_waiting.py +++ b/examples/tpch/q21_suppliers_kept_orders_waiting.py @@ -35,16 +35,16 @@ ctx = SessionContext() -df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns( +df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select( "o_orderkey", "o_orderstatus" ) -df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns( +df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select( "l_orderkey", "l_receiptdate", "l_commitdate", "l_suppkey" ) -df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns( +df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select( "s_suppkey", "s_name", "s_nationkey" ) -df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns( +df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select( "n_nationkey", "n_name" ) diff --git a/examples/tpch/q22_global_sales_opportunity.py b/examples/tpch/q22_global_sales_opportunity.py index 41fd5de9e..e6660e60c 100644 --- a/examples/tpch/q22_global_sales_opportunity.py +++ b/examples/tpch/q22_global_sales_opportunity.py @@ -35,12 +35,10 @@ ctx = SessionContext() -df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select_columns( +df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select( "c_phone", "c_acctbal", "c_custkey" ) -df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns( - "o_custkey" -) +df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select("o_custkey") # The nation code is a two digit number, but we need to convert it to a string literal nation_codes = F.make_array(*[lit(str(n)) for n in NATION_CODES]) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index e4f8073d3..e59f00d9f 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -98,6 +98,9 @@ def schema(self) -> pa.Schema: """ return 
self.df.schema() + @deprecated( + "select_columns() is deprecated. Use :py:meth:`~DataFrame.select` instead" + ) def select_columns(self, *args: str) -> DataFrame: """Filter the DataFrame by columns. diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index d73f5ebde..7b20e9e39 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -103,30 +103,28 @@ def partitioned_df(): def test_select(df): - df = df.select( + df_1 = df.select( column("a") + column("b"), column("a") - column("b"), ) # execute and collect the first (and only) batch - result = df.collect()[0] + result = df_1.collect()[0] assert result.column(0) == pa.array([5, 7, 9]) assert result.column(1) == pa.array([-3, -3, -3]) - -def test_select_mixed_expr_string(df): - df = df.select_columns(column("b"), "a") + df_2 = df.select("b", "a") # execute and collect the first (and only) batch - result = df.collect()[0] + result = df_2.collect()[0] assert result.column(0) == pa.array([4, 5, 6]) assert result.column(1) == pa.array([1, 2, 3]) -def test_select_columns(df): - df = df.select_columns("b", "a") +def test_select_mixed_expr_string(df): + df = df.select(column("b"), "a") # execute and collect the first (and only) batch result = df.collect()[0] From 7007e0239f0a0dbe863936b3937cb8a787971a59 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 22 Oct 2024 12:47:48 -0500 Subject: [PATCH 066/248] build(deps): bump uuid from 1.10.0 to 1.11.0 (#927) Bumps [uuid](https://github.com/uuid-rs/uuid) from 1.10.0 to 1.11.0. - [Release notes](https://github.com/uuid-rs/uuid/releases) - [Commits](https://github.com/uuid-rs/uuid/compare/1.10.0...1.11.0) --- updated-dependencies: - dependency-name: uuid dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 56c368f46..0835f219e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3406,9 +3406,9 @@ dependencies = [ [[package]] name = "uuid" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" +checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" dependencies = [ "getrandom", "serde", diff --git a/Cargo.toml b/Cargo.toml index df72cd40a..073f82cf0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,7 +41,7 @@ datafusion = { version = "42.0.0", features = ["pyarrow", "avro", "unicode_expre datafusion-substrait = { version = "42.0.0", optional = true } datafusion-proto = { version = "42.0.0" } prost = "0.13" # keep in line with `datafusion-substrait` -uuid = { version = "1.9", features = ["v4"] } +uuid = { version = "1.11", features = ["v4"] } mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] } async-trait = "0.1" futures = "0.3" From 0bc2f31d8d9ac46e1732d1267cb8d83c38452f45 Mon Sep 17 00:00:00 2001 From: kosiew Date: Tue, 29 Oct 2024 00:03:00 +0800 Subject: [PATCH 067/248] Add array_empty (#931) --- .../user-guide/common-operations/expressions.rst | 15 ++++++++++++++- python/datafusion/functions.py | 6 ++++++ python/tests/test_functions.py | 4 ++++ src/functions.rs | 2 ++ 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/docs/source/user-guide/common-operations/expressions.rst b/docs/source/user-guide/common-operations/expressions.rst index 6014c9d2e..77f3359f5 100644 --- a/docs/source/user-guide/common-operations/expressions.rst +++ b/docs/source/user-guide/common-operations/expressions.rst @@ -77,12 +77,25 @@ 
approaches. df = ctx.from_pydict({"a": [[1, 2, 3], [4, 5, 6]]}) df.select(col("a")[0].alias("a0")) - .. warning:: Indexing an element of an array via ``[]`` starts at index 0 whereas :py:func:`~datafusion.functions.array_element` starts at index 1. +To check if an array is empty, you can use the function :py:func:`datafusion.functions.array_empty`. +This function returns a boolean indicating whether the array is empty. + +.. ipython:: python + + from datafusion import SessionContext, col + from datafusion.functions import array_empty + + ctx = SessionContext() + df = ctx.from_pydict({"a": [[], [1, 2, 3]]}) + df.select(array_empty(col("a")).alias("is_empty")) + +In this example, the `is_empty` column will contain `True` for the first row and `False` for the second row. + Structs ------- diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 727321979..570a6ce5e 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -51,6 +51,7 @@ "array_dims", "array_distinct", "array_element", + "array_empty", "array_except", "array_extract", "array_has", @@ -1160,6 +1161,11 @@ def array_element(array: Expr, n: Expr) -> Expr: return Expr(f.array_element(array.expr, n.expr)) +def array_empty(array: Expr) -> Expr: + """Returns a boolean indicating whether the array is empty.""" + return Expr(f.array_empty(array.expr)) + + def array_extract(array: Expr, n: Expr) -> Expr: """Extracts the element with the index n from the array. 
diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 9353f872d..e6fd41d8b 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -309,6 +309,10 @@ def py_flatten(arr): lambda col: f.array_element(col, literal(1)), lambda data: [r[0] for r in data], ], + [ + lambda col: f.array_empty(col), + lambda data: [len(r) == 0 for r in data], + ], [ lambda col: f.array_extract(col, literal(1)), lambda data: [r[0] for r in data], diff --git a/src/functions.rs b/src/functions.rs index 24d33af39..4facb6cf7 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -572,6 +572,7 @@ array_fn!(array_to_string, array delimiter); array_fn!(array_dims, array); array_fn!(array_distinct, array); array_fn!(array_element, array element); +array_fn!(array_empty, array); array_fn!(array_length, array); array_fn!(array_has, first_array second_array); array_fn!(array_has_all, first_array second_array); @@ -1003,6 +1004,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(array_dims))?; m.add_wrapped(wrap_pyfunction!(array_distinct))?; m.add_wrapped(wrap_pyfunction!(array_element))?; + m.add_wrapped(wrap_pyfunction!(array_empty))?; m.add_wrapped(wrap_pyfunction!(array_length))?; m.add_wrapped(wrap_pyfunction!(array_has))?; m.add_wrapped(wrap_pyfunction!(array_has_all))?; From e015482750e9e08bd426bfcf649445d53705c51a Mon Sep 17 00:00:00 2001 From: kosiew Date: Tue, 29 Oct 2024 18:16:50 +0800 Subject: [PATCH 068/248] feat: add `cardinality` function to calculate total elements in an array (#937) --- .../common-operations/expressions.rst | 14 ++++++++++++++ python/datafusion/functions.py | 6 ++++++ python/tests/test_functions.py | 18 ++++++++++++++++++ src/functions.rs | 2 ++ 4 files changed, 40 insertions(+) diff --git a/docs/source/user-guide/common-operations/expressions.rst b/docs/source/user-guide/common-operations/expressions.rst index 77f3359f5..23430d359 100644 --- 
a/docs/source/user-guide/common-operations/expressions.rst +++ b/docs/source/user-guide/common-operations/expressions.rst @@ -96,6 +96,20 @@ This function returns a boolean indicating whether the array is empty. In this example, the `is_empty` column will contain `True` for the first row and `False` for the second row. +To get the total number of elements in an array, you can use the function :py:func:`datafusion.functions.cardinality`. +This function returns an integer indicating the total number of elements in the array. + +.. ipython:: python + + from datafusion import SessionContext, col + from datafusion.functions import cardinality + + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 2, 3], [4, 5, 6]]}) + df.select(cardinality(col("a")).alias("num_elements")) + +In this example, the `num_elements` column will contain `3` for both rows. + Structs ------- diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 570a6ce5e..e67ba4ae4 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -132,6 +132,7 @@ "find_in_set", "first_value", "flatten", + "cardinality", "floor", "from_unixtime", "gcd", @@ -1516,6 +1517,11 @@ def flatten(array: Expr) -> Expr: return Expr(f.flatten(array.expr)) +def cardinality(array: Expr) -> Expr: + """Returns the total number of elements in the array.""" + return Expr(f.cardinality(array.expr)) + + # aggregate functions def approx_distinct( expression: Expr, diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index e6fd41d8b..37943e57c 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -540,6 +540,24 @@ def test_array_function_flatten(): ) +def test_array_function_cardinality(): + data = [[1, 2, 3], [4, 4, 5, 6]] + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays([np.array(data, dtype=object)], names=["arr"]) + df = ctx.create_dataframe([[batch]]) + + stmt = f.cardinality(column("arr")) + py_expr = [len(arr) 
for arr in data] # Expected lengths: [3, 4] + # assert py_expr lengths + + query_result = df.select(stmt).collect()[0].column(0) + + for a, b in zip(query_result, py_expr): + np.testing.assert_array_equal( + np.array([a.as_py()], dtype=int), np.array([b], dtype=int) + ) + + @pytest.mark.parametrize( ("stmt", "py_expr"), [ diff --git a/src/functions.rs b/src/functions.rs index 4facb6cf7..fe3531ba9 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -594,6 +594,7 @@ array_fn!(array_intersect, first_array second_array); array_fn!(array_union, array1 array2); array_fn!(array_except, first_array second_array); array_fn!(array_resize, array size value); +array_fn!(cardinality, array); array_fn!(flatten, array); array_fn!(range, start stop step); @@ -1030,6 +1031,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(array_sort))?; m.add_wrapped(wrap_pyfunction!(array_slice))?; m.add_wrapped(wrap_pyfunction!(flatten))?; + m.add_wrapped(wrap_pyfunction!(cardinality))?; // Window Functions m.add_wrapped(wrap_pyfunction!(lead))?; From aedffe0d8a0522fa21a7b545aa885750f32fc218 Mon Sep 17 00:00:00 2001 From: kosiew Date: Fri, 1 Nov 2024 19:42:57 +0800 Subject: [PATCH 069/248] Add empty scalar function (alias of array_empty), fix a small typo (#938) * feat: add `empty` function as alias of array_empty * fix: correct typo in null_treatment parameter documentation --- .../user-guide/common-operations/expressions.rst | 2 +- python/datafusion/functions.py | 12 +++++++++--- python/tests/test_functions.py | 4 ++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/source/user-guide/common-operations/expressions.rst b/docs/source/user-guide/common-operations/expressions.rst index 23430d359..b2a83c89f 100644 --- a/docs/source/user-guide/common-operations/expressions.rst +++ b/docs/source/user-guide/common-operations/expressions.rst @@ -82,7 +82,7 @@ approaches.
Indexing an element of an array via ``[]`` starts at index 0 whereas :py:func:`~datafusion.functions.array_element` starts at index 1. -To check if an array is empty, you can use the function :py:func:`datafusion.functions.array_empty`. +To check if an array is empty, you can use the function :py:func:`datafusion.functions.array_empty` or :py:func:`datafusion.functions.empty`. This function returns a boolean indicating whether the array is empty. .. ipython:: python diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index e67ba4ae4..907f801af 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -125,6 +125,7 @@ "decode", "degrees", "digest", + "empty", "encode", "ends_with", "exp", @@ -1522,6 +1523,11 @@ def cardinality(array: Expr) -> Expr: return Expr(f.cardinality(array.expr)) +def empty(array: Expr) -> Expr: + """This is an alias for :py:func:`array_empty`.""" + return array_empty(array) + + # aggregate functions def approx_distinct( expression: Expr, @@ -2140,7 +2146,7 @@ def first_value( expression: Argument to perform bitwise calculation on filter: If provided, only compute against rows for which the filter is True order_by: Set the ordering of the expression to evaluate - null_treatment: Assign whether to respect or ignull null values. + null_treatment: Assign whether to respect or ignore null values. """ order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None @@ -2172,7 +2178,7 @@ def last_value( expression: Argument to perform bitwise calculation on filter: If provided, only compute against rows for which the filter is True order_by: Set the ordering of the expression to evaluate - null_treatment: Assign whether to respect or ignull null values. + null_treatment: Assign whether to respect or ignore null values.
""" order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None @@ -2206,7 +2212,7 @@ def nth_value( n: Index of value to return. Starts at 1. filter: If provided, only compute against rows for which the filter is True order_by: Set the ordering of the expression to evaluate - null_treatment: Assign whether to respect or ignull null values. + null_treatment: Assign whether to respect or ignore null values. """ order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 37943e57c..c65c633a4 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -313,6 +313,10 @@ def py_flatten(arr): lambda col: f.array_empty(col), lambda data: [len(r) == 0 for r in data], ], + [ + lambda col: f.empty(col), + lambda data: [len(r) == 0 for r in data], + ], [ lambda col: f.array_extract(col, literal(1)), lambda data: [r[0] for r in data], From cbe28cb4bb53c26940f5c020981592141030a324 Mon Sep 17 00:00:00 2001 From: David Rauschenbach Date: Tue, 5 Nov 2024 08:35:45 -0800 Subject: [PATCH 070/248] README How to develop section now also works on Apple M1 (#940) --- README.md | 2 + conda/environments/datafusion-cuda-dev.yaml | 44 +++++++++++++++++++++ conda/environments/datafusion-dev.yaml | 3 -- 3 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 conda/environments/datafusion-cuda-dev.yaml diff --git a/README.md b/README.md index b1d5397ef..83b307e7a 100644 --- a/README.md +++ b/README.md @@ -179,6 +179,8 @@ conda env create -f ./conda/environments/datafusion-dev.yaml -n datafusion-dev conda activate datafusion-dev ``` +Or alternatively, if you are on an OS that supports CUDA Toolkit, you can use `-f ./conda/environments/datafusion-cuda-dev.yaml`. 
+ Bootstrap (Pip): ```bash diff --git a/conda/environments/datafusion-cuda-dev.yaml b/conda/environments/datafusion-cuda-dev.yaml new file mode 100644 index 000000000..1f6f23942 --- /dev/null +++ b/conda/environments/datafusion-cuda-dev.yaml @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +channels: + - conda-forge +dependencies: + - black + - flake8 + - isort + - maturin>=1.5.1 + - mypy + - numpy + - pyarrow>=11.0.0 + - pytest + - toml + - importlib_metadata + - python>=3.10 + # Packages useful for building distributions and releasing + - mamba + - conda-build + - anaconda-client + # Packages for documentation building + - sphinx + - pydata-sphinx-theme==0.8.0 + - myst-parser + - jinja2 + # GPU packages + - cudf + - cudatoolkit=11.8 +name: datafusion-dev diff --git a/conda/environments/datafusion-dev.yaml b/conda/environments/datafusion-dev.yaml index 1f6f23942..b4b503dc6 100644 --- a/conda/environments/datafusion-dev.yaml +++ b/conda/environments/datafusion-dev.yaml @@ -38,7 +38,4 @@ dependencies: - pydata-sphinx-theme==0.8.0 - myst-parser - jinja2 - # GPU packages - - cudf - - cudatoolkit=11.8 name: datafusion-dev From 4a6c4d129af3e1eb207f64ee84285419afb26876 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Fri, 8 Nov 2024 15:06:27 +0100 Subject: [PATCH 071/248] refactor: dataframe `join` params (#912) * refactor: dataframe join params * chore: add description for on params * fix type * chore: change join param * chore: update join params in tpch * oops * chore: final change * Add support for join_keys as a positional argument --------- Co-authored-by: Tim Saucer --- .../user-guide/common-operations/joins.rst | 10 +- examples/tpch/_tests.py | 6 +- examples/tpch/q02_minimum_cost_supplier.py | 12 ++- examples/tpch/q03_shipping_priority.py | 6 +- examples/tpch/q04_order_priority_checking.py | 4 +- examples/tpch/q05_local_supplier_volume.py | 13 ++- examples/tpch/q07_volume_shipping.py | 12 ++- examples/tpch/q08_market_share.py | 14 +-- .../tpch/q09_product_type_profit_measure.py | 13 ++- examples/tpch/q10_returned_item_reporting.py | 6 +- .../q11_important_stock_identification.py | 6 +- examples/tpch/q12_ship_mode_order_priority.py | 2 +- examples/tpch/q13_customer_distribution.py | 4 +- 
examples/tpch/q14_promotion_effect.py | 4 +- examples/tpch/q15_top_supplier.py | 2 +- .../tpch/q16_part_supplier_relationship.py | 6 +- examples/tpch/q17_small_quantity_order.py | 2 +- examples/tpch/q18_large_volume_customer.py | 4 +- examples/tpch/q19_discounted_revenue.py | 2 +- examples/tpch/q20_potential_part_promotion.py | 11 ++- .../tpch/q21_suppliers_kept_orders_waiting.py | 8 +- examples/tpch/q22_global_sales_opportunity.py | 2 +- python/datafusion/dataframe.py | 95 +++++++++++++++++-- python/tests/test_dataframe.py | 66 ++++++++++++- src/dataframe.rs | 15 +-- 25 files changed, 240 insertions(+), 85 deletions(-) diff --git a/docs/source/user-guide/common-operations/joins.rst b/docs/source/user-guide/common-operations/joins.rst index 09fa145a7..40d922150 100644 --- a/docs/source/user-guide/common-operations/joins.rst +++ b/docs/source/user-guide/common-operations/joins.rst @@ -56,7 +56,7 @@ will be included in the resulting DataFrame. .. ipython:: python - left.join(right, join_keys=(["customer_id"], ["id"]), how="inner") + left.join(right, left_on="customer_id", right_on="id", how="inner") The parameter ``join_keys`` specifies the columns from the left DataFrame and right DataFrame that contains the values that should match. @@ -70,7 +70,7 @@ values for the corresponding columns. .. ipython:: python - left.join(right, join_keys=(["customer_id"], ["id"]), how="left") + left.join(right, left_on="customer_id", right_on="id", how="left") Full Join --------- @@ -80,7 +80,7 @@ is no match. Unmatched rows will have null values. .. ipython:: python - left.join(right, join_keys=(["customer_id"], ["id"]), how="full") + left.join(right, left_on="customer_id", right_on="id", how="full") Left Semi Join -------------- @@ -90,7 +90,7 @@ omitting duplicates with multiple matches in the right table. .. 
ipython:: python - left.join(right, join_keys=(["customer_id"], ["id"]), how="semi") + left.join(right, left_on="customer_id", right_on="id", how="semi") Left Anti Join -------------- @@ -101,4 +101,4 @@ the right table. .. ipython:: python - left.join(right, join_keys=(["customer_id"], ["id"]), how="anti") \ No newline at end of file + left.join(right, left_on="customer_id", right_on="id", how="anti") \ No newline at end of file diff --git a/examples/tpch/_tests.py b/examples/tpch/_tests.py index 903b53548..13144ae9d 100644 --- a/examples/tpch/_tests.py +++ b/examples/tpch/_tests.py @@ -18,7 +18,7 @@ import pytest from importlib import import_module import pyarrow as pa -from datafusion import col, lit, functions as F +from datafusion import DataFrame, col, lit, functions as F from util import get_answer_file @@ -94,7 +94,7 @@ def check_q17(df): ) def test_tpch_query_vs_answer_file(query_code: str, answer_file: str): module = import_module(query_code) - df = module.df + df: DataFrame = module.df # Treat q17 as a special case. The answer file does not match the spec. # Running at scale factor 1, we have manually verified this result does @@ -121,5 +121,5 @@ def test_tpch_query_vs_answer_file(query_code: str, answer_file: str): cols = list(read_schema.names) - assert df.join(df_expected, (cols, cols), "anti").count() == 0 + assert df.join(df_expected, on=cols, how="anti").count() == 0 assert df.count() == df_expected.count() diff --git a/examples/tpch/q02_minimum_cost_supplier.py b/examples/tpch/q02_minimum_cost_supplier.py index 2440fdad6..c4ccf8ad3 100644 --- a/examples/tpch/q02_minimum_cost_supplier.py +++ b/examples/tpch/q02_minimum_cost_supplier.py @@ -80,16 +80,20 @@ # Now that we have the region, find suppliers in that region. Suppliers are tied to their nation # and nations are tied to the region. 
-df_nation = df_nation.join(df_region, (["n_regionkey"], ["r_regionkey"]), how="inner") +df_nation = df_nation.join( + df_region, left_on=["n_regionkey"], right_on=["r_regionkey"], how="inner" +) df_supplier = df_supplier.join( - df_nation, (["s_nationkey"], ["n_nationkey"]), how="inner" + df_nation, left_on=["s_nationkey"], right_on=["n_nationkey"], how="inner" ) # Now that we know who the potential suppliers are for the part, we can limit out part # supplies table down. We can further join down to the specific parts we've identified # as matching the request -df = df_partsupp.join(df_supplier, (["ps_suppkey"], ["s_suppkey"]), how="inner") +df = df_partsupp.join( + df_supplier, left_on=["ps_suppkey"], right_on=["s_suppkey"], how="inner" +) # Locate the minimum cost across all suppliers. There are multiple ways you could do this, # but one way is to create a window function across all suppliers, find the minimum, and @@ -111,7 +115,7 @@ df = df.filter(col("min_cost") == col("ps_supplycost")) -df = df.join(df_part, (["ps_partkey"], ["p_partkey"]), how="inner") +df = df.join(df_part, left_on=["ps_partkey"], right_on=["p_partkey"], how="inner") # From the problem statement, these are the values we wish to output diff --git a/examples/tpch/q03_shipping_priority.py b/examples/tpch/q03_shipping_priority.py index c4e8f461a..5ebab13c0 100644 --- a/examples/tpch/q03_shipping_priority.py +++ b/examples/tpch/q03_shipping_priority.py @@ -55,9 +55,9 @@ # Join all 3 dataframes -df = df_customer.join(df_orders, (["c_custkey"], ["o_custkey"]), how="inner").join( - df_lineitem, (["o_orderkey"], ["l_orderkey"]), how="inner" -) +df = df_customer.join( + df_orders, left_on=["c_custkey"], right_on=["o_custkey"], how="inner" +).join(df_lineitem, left_on=["o_orderkey"], right_on=["l_orderkey"], how="inner") # Compute the revenue diff --git a/examples/tpch/q04_order_priority_checking.py b/examples/tpch/q04_order_priority_checking.py index f10b74d91..8bf02cb83 100644 --- 
a/examples/tpch/q04_order_priority_checking.py +++ b/examples/tpch/q04_order_priority_checking.py @@ -66,7 +66,9 @@ ) # Perform the join to find only orders for which there are lineitems outside of expected range -df = df_orders.join(df_lineitem, (["o_orderkey"], ["l_orderkey"]), how="inner") +df = df_orders.join( + df_lineitem, left_on=["o_orderkey"], right_on=["l_orderkey"], how="inner" +) # Based on priority, find the number of entries df = df.aggregate( diff --git a/examples/tpch/q05_local_supplier_volume.py b/examples/tpch/q05_local_supplier_volume.py index 2a83d2d1a..413a4acb9 100644 --- a/examples/tpch/q05_local_supplier_volume.py +++ b/examples/tpch/q05_local_supplier_volume.py @@ -76,15 +76,18 @@ # Join all the dataframes df = ( - df_customer.join(df_orders, (["c_custkey"], ["o_custkey"]), how="inner") - .join(df_lineitem, (["o_orderkey"], ["l_orderkey"]), how="inner") + df_customer.join( + df_orders, left_on=["c_custkey"], right_on=["o_custkey"], how="inner" + ) + .join(df_lineitem, left_on=["o_orderkey"], right_on=["l_orderkey"], how="inner") .join( df_supplier, - (["l_suppkey", "c_nationkey"], ["s_suppkey", "s_nationkey"]), + left_on=["l_suppkey", "c_nationkey"], + right_on=["s_suppkey", "s_nationkey"], how="inner", ) - .join(df_nation, (["s_nationkey"], ["n_nationkey"]), how="inner") - .join(df_region, (["n_regionkey"], ["r_regionkey"]), how="inner") + .join(df_nation, left_on=["s_nationkey"], right_on=["n_nationkey"], how="inner") + .join(df_region, left_on=["n_regionkey"], right_on=["r_regionkey"], how="inner") ) # Compute the final result diff --git a/examples/tpch/q07_volume_shipping.py b/examples/tpch/q07_volume_shipping.py index a1d7d81ad..18c290d9c 100644 --- a/examples/tpch/q07_volume_shipping.py +++ b/examples/tpch/q07_volume_shipping.py @@ -90,20 +90,22 @@ # Limit suppliers to either nation df_supplier = df_supplier.join( - df_nation, (["s_nationkey"], ["n_nationkey"]), how="inner" + df_nation, left_on=["s_nationkey"], 
right_on=["n_nationkey"], how="inner" ).select(col("s_suppkey"), col("n_name").alias("supp_nation")) # Limit customers to either nation df_customer = df_customer.join( - df_nation, (["c_nationkey"], ["n_nationkey"]), how="inner" + df_nation, left_on=["c_nationkey"], right_on=["n_nationkey"], how="inner" ).select(col("c_custkey"), col("n_name").alias("cust_nation")) # Join up all the data frames from line items, and make sure the supplier and customer are in # different nations. df = ( - df_lineitem.join(df_orders, (["l_orderkey"], ["o_orderkey"]), how="inner") - .join(df_customer, (["o_custkey"], ["c_custkey"]), how="inner") - .join(df_supplier, (["l_suppkey"], ["s_suppkey"]), how="inner") + df_lineitem.join( + df_orders, left_on=["l_orderkey"], right_on=["o_orderkey"], how="inner" + ) + .join(df_customer, left_on=["o_custkey"], right_on=["c_custkey"], how="inner") + .join(df_supplier, left_on=["l_suppkey"], right_on=["s_suppkey"], how="inner") .filter(col("cust_nation") != col("supp_nation")) ) diff --git a/examples/tpch/q08_market_share.py b/examples/tpch/q08_market_share.py index 95fc0a871..7138ab65a 100644 --- a/examples/tpch/q08_market_share.py +++ b/examples/tpch/q08_market_share.py @@ -89,27 +89,27 @@ # After this join we have all of the possible sales nations df_regional_customers = df_regional_customers.join( - df_nation, (["r_regionkey"], ["n_regionkey"]), how="inner" + df_nation, left_on=["r_regionkey"], right_on=["n_regionkey"], how="inner" ) # Now find the possible customers df_regional_customers = df_regional_customers.join( - df_customer, (["n_nationkey"], ["c_nationkey"]), how="inner" + df_customer, left_on=["n_nationkey"], right_on=["c_nationkey"], how="inner" ) # Next find orders for these customers df_regional_customers = df_regional_customers.join( - df_orders, (["c_custkey"], ["o_custkey"]), how="inner" + df_orders, left_on=["c_custkey"], right_on=["o_custkey"], how="inner" ) # Find all line items from these orders df_regional_customers = 
df_regional_customers.join( - df_lineitem, (["o_orderkey"], ["l_orderkey"]), how="inner" + df_lineitem, left_on=["o_orderkey"], right_on=["l_orderkey"], how="inner" ) # Limit to the part of interest df_regional_customers = df_regional_customers.join( - df_part, (["l_partkey"], ["p_partkey"]), how="inner" + df_part, left_on=["l_partkey"], right_on=["p_partkey"], how="inner" ) # Compute the volume for each line item @@ -126,7 +126,7 @@ # Determine the suppliers by the limited nation key we have in our single row df above df_national_suppliers = df_national_suppliers.join( - df_supplier, (["n_nationkey"], ["s_nationkey"]), how="inner" + df_supplier, left_on=["n_nationkey"], right_on=["s_nationkey"], how="inner" ) # When we join to the customer dataframe, we don't want to confuse other columns, so only @@ -141,7 +141,7 @@ # column only from suppliers in the nation we are evaluating. df = df_regional_customers.join( - df_national_suppliers, (["l_suppkey"], ["s_suppkey"]), how="left" + df_national_suppliers, left_on=["l_suppkey"], right_on=["s_suppkey"], how="left" ) # Use a case statement to compute the volume sold by suppliers in the nation of interest diff --git a/examples/tpch/q09_product_type_profit_measure.py b/examples/tpch/q09_product_type_profit_measure.py index 0295d3025..aa47d76c0 100644 --- a/examples/tpch/q09_product_type_profit_measure.py +++ b/examples/tpch/q09_product_type_profit_measure.py @@ -65,13 +65,16 @@ df = df_part.filter(F.strpos(col("p_name"), part_color) > lit(0)) # We have a series of joins that get us to limit down to the line items we need -df = df.join(df_lineitem, (["p_partkey"], ["l_partkey"]), how="inner") -df = df.join(df_supplier, (["l_suppkey"], ["s_suppkey"]), how="inner") -df = df.join(df_orders, (["l_orderkey"], ["o_orderkey"]), how="inner") +df = df.join(df_lineitem, left_on=["p_partkey"], right_on=["l_partkey"], how="inner") +df = df.join(df_supplier, left_on=["l_suppkey"], right_on=["s_suppkey"], how="inner") +df = 
df.join(df_orders, left_on=["l_orderkey"], right_on=["o_orderkey"], how="inner") df = df.join( - df_partsupp, (["l_suppkey", "l_partkey"], ["ps_suppkey", "ps_partkey"]), how="inner" + df_partsupp, + left_on=["l_suppkey", "l_partkey"], + right_on=["ps_suppkey", "ps_partkey"], + how="inner", ) -df = df.join(df_nation, (["s_nationkey"], ["n_nationkey"]), how="inner") +df = df.join(df_nation, left_on=["s_nationkey"], right_on=["n_nationkey"], how="inner") # Compute the intermediate values and limit down to the expressions we need df = df.select( diff --git a/examples/tpch/q10_returned_item_reporting.py b/examples/tpch/q10_returned_item_reporting.py index 25f81b2ff..94b398c1d 100644 --- a/examples/tpch/q10_returned_item_reporting.py +++ b/examples/tpch/q10_returned_item_reporting.py @@ -74,7 +74,7 @@ col("o_orderdate") < date_start_of_quarter + interval_one_quarter ) -df = df.join(df_lineitem, (["o_orderkey"], ["l_orderkey"]), how="inner") +df = df.join(df_lineitem, left_on=["o_orderkey"], right_on=["l_orderkey"], how="inner") # Compute the revenue df = df.aggregate( @@ -83,8 +83,8 @@ ) # Now join in the customer data -df = df.join(df_customer, (["o_custkey"], ["c_custkey"]), how="inner") -df = df.join(df_nation, (["c_nationkey"], ["n_nationkey"]), how="inner") +df = df.join(df_customer, left_on=["o_custkey"], right_on=["c_custkey"], how="inner") +df = df.join(df_nation, left_on=["c_nationkey"], right_on=["n_nationkey"], how="inner") # These are the columns the problem statement requires df = df.select( diff --git a/examples/tpch/q11_important_stock_identification.py b/examples/tpch/q11_important_stock_identification.py index 86ff2296b..707265e16 100644 --- a/examples/tpch/q11_important_stock_identification.py +++ b/examples/tpch/q11_important_stock_identification.py @@ -52,9 +52,11 @@ # Find part supplies of within this target nation -df = df_nation.join(df_supplier, (["n_nationkey"], ["s_nationkey"]), how="inner") +df = df_nation.join( + df_supplier, 
left_on=["n_nationkey"], right_on=["s_nationkey"], how="inner" +) -df = df.join(df_partsupp, (["s_suppkey"], ["ps_suppkey"]), how="inner") +df = df.join(df_partsupp, left_on=["s_suppkey"], right_on=["ps_suppkey"], how="inner") # Compute the value of individual parts diff --git a/examples/tpch/q12_ship_mode_order_priority.py b/examples/tpch/q12_ship_mode_order_priority.py index c3fc0d2e9..def2a6c30 100644 --- a/examples/tpch/q12_ship_mode_order_priority.py +++ b/examples/tpch/q12_ship_mode_order_priority.py @@ -75,7 +75,7 @@ # We need order priority, so join order df to line item -df = df.join(df_orders, (["l_orderkey"], ["o_orderkey"]), how="inner") +df = df.join(df_orders, left_on=["l_orderkey"], right_on=["o_orderkey"], how="inner") # Restrict to line items we care about based on the problem statement. df = df.filter(col("l_commitdate") < col("l_receiptdate")) diff --git a/examples/tpch/q13_customer_distribution.py b/examples/tpch/q13_customer_distribution.py index f8b6c139d..67365a96a 100644 --- a/examples/tpch/q13_customer_distribution.py +++ b/examples/tpch/q13_customer_distribution.py @@ -49,7 +49,9 @@ ) # Since we may have customers with no orders we must do a left join -df = df_customer.join(df_orders, (["c_custkey"], ["o_custkey"]), how="left") +df = df_customer.join( + df_orders, left_on=["c_custkey"], right_on=["o_custkey"], how="left" +) # Find the number of orders for each customer df = df.aggregate([col("c_custkey")], [F.count(col("o_custkey")).alias("c_count")]) diff --git a/examples/tpch/q14_promotion_effect.py b/examples/tpch/q14_promotion_effect.py index 8224136ad..cd26ee2bd 100644 --- a/examples/tpch/q14_promotion_effect.py +++ b/examples/tpch/q14_promotion_effect.py @@ -57,7 +57,9 @@ ) # Left join so we can sum up the promo parts different from other parts -df = df_lineitem.join(df_part, (["l_partkey"], ["p_partkey"]), "left") +df = df_lineitem.join( + df_part, left_on=["l_partkey"], right_on=["p_partkey"], how="left" +) # Make a factor of 1.0 
if it is a promotion, 0.0 otherwise df = df.with_column("promo_factor", F.coalesce(col("promo_factor"), lit(0.0))) diff --git a/examples/tpch/q15_top_supplier.py b/examples/tpch/q15_top_supplier.py index 44d5dd997..0bc316f7a 100644 --- a/examples/tpch/q15_top_supplier.py +++ b/examples/tpch/q15_top_supplier.py @@ -76,7 +76,7 @@ # Now that we know the supplier(s) with maximum revenue, get the rest of their information # from the supplier table -df = df.join(df_supplier, (["l_suppkey"], ["s_suppkey"]), "inner") +df = df.join(df_supplier, left_on=["l_suppkey"], right_on=["s_suppkey"], how="inner") # Return only the columns requested df = df.select("s_suppkey", "s_name", "s_address", "s_phone", "total_revenue") diff --git a/examples/tpch/q16_part_supplier_relationship.py b/examples/tpch/q16_part_supplier_relationship.py index cbdd9989a..a6a0c43eb 100644 --- a/examples/tpch/q16_part_supplier_relationship.py +++ b/examples/tpch/q16_part_supplier_relationship.py @@ -56,7 +56,7 @@ # Remove unwanted suppliers df_partsupp = df_partsupp.join( - df_unwanted_suppliers, (["ps_suppkey"], ["s_suppkey"]), "anti" + df_unwanted_suppliers, left_on=["ps_suppkey"], right_on=["s_suppkey"], how="anti" ) # Select the parts we are interested in @@ -73,7 +73,9 @@ p_sizes = F.make_array(*[lit(s).cast(pa.int32()) for s in SIZES_OF_INTEREST]) df_part = df_part.filter(~F.array_position(p_sizes, col("p_size")).is_null()) -df = df_part.join(df_partsupp, (["p_partkey"], ["ps_partkey"]), "inner") +df = df_part.join( + df_partsupp, left_on=["p_partkey"], right_on=["ps_partkey"], how="inner" +) df = df.select("p_brand", "p_type", "p_size", "ps_suppkey").distinct() diff --git a/examples/tpch/q17_small_quantity_order.py b/examples/tpch/q17_small_quantity_order.py index ff494279b..d7b43d498 100644 --- a/examples/tpch/q17_small_quantity_order.py +++ b/examples/tpch/q17_small_quantity_order.py @@ -51,7 +51,7 @@ ) # Combine data -df = df.join(df_lineitem, (["p_partkey"], ["l_partkey"]), "inner") +df = 
df.join(df_lineitem, left_on=["p_partkey"], right_on=["l_partkey"], how="inner") # Find the average quantity window_frame = WindowFrame("rows", None, None) diff --git a/examples/tpch/q18_large_volume_customer.py b/examples/tpch/q18_large_volume_customer.py index 497615499..165fce033 100644 --- a/examples/tpch/q18_large_volume_customer.py +++ b/examples/tpch/q18_large_volume_customer.py @@ -54,8 +54,8 @@ # We've identified the orders of interest, now join the additional data # we are required to report on -df = df.join(df_orders, (["l_orderkey"], ["o_orderkey"]), "inner") -df = df.join(df_customer, (["o_custkey"], ["c_custkey"]), "inner") +df = df.join(df_orders, left_on=["l_orderkey"], right_on=["o_orderkey"], how="inner") +df = df.join(df_customer, left_on=["o_custkey"], right_on=["c_custkey"], how="inner") df = df.select( "c_name", "c_custkey", "o_orderkey", "o_orderdate", "o_totalprice", "total_quantity" diff --git a/examples/tpch/q19_discounted_revenue.py b/examples/tpch/q19_discounted_revenue.py index c2fe2570d..4aed0cbae 100644 --- a/examples/tpch/q19_discounted_revenue.py +++ b/examples/tpch/q19_discounted_revenue.py @@ -72,7 +72,7 @@ (col("l_shipmode") == lit("AIR")) | (col("l_shipmode") == lit("AIR REG")) ) -df = df.join(df_part, (["l_partkey"], ["p_partkey"]), "inner") +df = df.join(df_part, left_on=["l_partkey"], right_on=["p_partkey"], how="inner") # Create the user defined function (UDF) definition that does the work diff --git a/examples/tpch/q20_potential_part_promotion.py b/examples/tpch/q20_potential_part_promotion.py index 3a0edb1ec..d720cdce6 100644 --- a/examples/tpch/q20_potential_part_promotion.py +++ b/examples/tpch/q20_potential_part_promotion.py @@ -70,7 +70,7 @@ ) # This will filter down the line items to the parts of interest -df = df.join(df_part, (["l_partkey"], ["p_partkey"]), "inner") +df = df.join(df_part, left_on="l_partkey", right_on="p_partkey", how="inner") # Compute the total sold and limit ourselves to individual supplier/part 
combinations df = df.aggregate( @@ -78,15 +78,18 @@ ) df = df.join( - df_partsupp, (["l_partkey", "l_suppkey"], ["ps_partkey", "ps_suppkey"]), "inner" + df_partsupp, + left_on=["l_partkey", "l_suppkey"], + right_on=["ps_partkey", "ps_suppkey"], + how="inner", ) # Find cases of excess quantity df.filter(col("ps_availqty") > lit(0.5) * col("total_sold")) # We could do these joins earlier, but now limit to the nation of interest suppliers -df = df.join(df_supplier, (["ps_suppkey"], ["s_suppkey"]), "inner") -df = df.join(df_nation, (["s_nationkey"], ["n_nationkey"]), "inner") +df = df.join(df_supplier, left_on=["ps_suppkey"], right_on=["s_suppkey"], how="inner") +df = df.join(df_nation, left_on=["s_nationkey"], right_on=["n_nationkey"], how="inner") # Restrict to the requested data per the problem statement df = df.select("s_name", "s_address").distinct() diff --git a/examples/tpch/q21_suppliers_kept_orders_waiting.py b/examples/tpch/q21_suppliers_kept_orders_waiting.py index d3d57acee..27cf816fa 100644 --- a/examples/tpch/q21_suppliers_kept_orders_waiting.py +++ b/examples/tpch/q21_suppliers_kept_orders_waiting.py @@ -52,13 +52,13 @@ df_suppliers_of_interest = df_nation.filter(col("n_name") == lit(NATION_OF_INTEREST)) df_suppliers_of_interest = df_suppliers_of_interest.join( - df_supplier, (["n_nationkey"], ["s_nationkey"]), "inner" + df_supplier, left_on="n_nationkey", right_on="s_nationkey", how="inner" ) # Find the failed orders and all their line items df = df_orders.filter(col("o_orderstatus") == lit("F")) -df = df_lineitem.join(df, (["l_orderkey"], ["o_orderkey"]), "inner") +df = df_lineitem.join(df, left_on="l_orderkey", right_on="o_orderkey", how="inner") # Identify the line items for which the order is failed due to. 
df = df.with_column( @@ -102,7 +102,9 @@ ) # Join to the supplier of interest list for the nation of interest -df = df.join(df_suppliers_of_interest, (["suppkey"], ["s_suppkey"]), "inner") +df = df.join( + df_suppliers_of_interest, left_on=["suppkey"], right_on=["s_suppkey"], how="inner" +) # Count how many orders that supplier is the only failed supplier for df = df.aggregate([col("s_name")], [F.count(col("o_orderkey")).alias("numwait")]) diff --git a/examples/tpch/q22_global_sales_opportunity.py b/examples/tpch/q22_global_sales_opportunity.py index e6660e60c..72dce5289 100644 --- a/examples/tpch/q22_global_sales_opportunity.py +++ b/examples/tpch/q22_global_sales_opportunity.py @@ -62,7 +62,7 @@ df = df.filter(col("c_acctbal") > col("avg_balance")) # Limit results to customers with no orders -df = df.join(df_orders, (["c_custkey"], ["o_custkey"]), "anti") +df = df.join(df_orders, left_on="c_custkey", right_on="o_custkey", how="anti") # Count up the customers and the balances df = df.aggregate( diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index e59f00d9f..efd4038ae 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -20,9 +20,8 @@ """ from __future__ import annotations - - -from typing import Any, Iterable, List, Literal, TYPE_CHECKING +import warnings +from typing import Any, Iterable, List, TYPE_CHECKING, Literal, overload from datafusion.record_batch import RecordBatchStream from typing_extensions import deprecated from datafusion.plan import LogicalPlan, ExecutionPlan @@ -32,7 +31,7 @@ import pandas as pd import polars as pl import pathlib - from typing import Callable + from typing import Callable, Sequence from datafusion._internal import DataFrame as DataFrameInternal from datafusion.expr import Expr, SortExpr, sort_or_default @@ -368,27 +367,105 @@ def distinct(self) -> DataFrame: """ return DataFrame(self.df.distinct()) + @overload + def join( + self, + right: DataFrame, + on: str | 
Sequence[str], + how: Literal["inner", "left", "right", "full", "semi", "anti"] = "inner", + *, + left_on: None = None, + right_on: None = None, + join_keys: None = None, + ) -> DataFrame: ... + + @overload def join( self, right: DataFrame, + on: None = None, + how: Literal["inner", "left", "right", "full", "semi", "anti"] = "inner", + *, + left_on: str | Sequence[str], + right_on: str | Sequence[str], + join_keys: tuple[list[str], list[str]] | None = None, + ) -> DataFrame: ... + + @overload + def join( + self, + right: DataFrame, + on: None = None, + how: Literal["inner", "left", "right", "full", "semi", "anti"] = "inner", + *, join_keys: tuple[list[str], list[str]], - how: str, + left_on: None = None, + right_on: None = None, + ) -> DataFrame: ... + + def join( + self, + right: DataFrame, + on: str | Sequence[str] | tuple[list[str], list[str]] | None = None, + how: Literal["inner", "left", "right", "full", "semi", "anti"] = "inner", + *, + left_on: str | Sequence[str] | None = None, + right_on: str | Sequence[str] | None = None, + join_keys: tuple[list[str], list[str]] | None = None, ) -> DataFrame: """Join this :py:class:`DataFrame` with another :py:class:`DataFrame`. - Join keys are a pair of lists of column names in the left and right - dataframes, respectively. These lists must have the same length. + `on` has to be provided or both `left_on` and `right_on` in conjunction. Args: right: Other DataFrame to join with. - join_keys: Tuple of two lists of column names to join on. + on: Column names to join on in both dataframes. how: Type of join to perform. Supported types are "inner", "left", "right", "full", "semi", "anti". + left_on: Join column of the left dataframe. + right_on: Join column of the right dataframe. + join_keys: Tuple of two lists of column names to join on. [Deprecated] Returns: DataFrame after join. 
""" - return DataFrame(self.df.join(right.df, join_keys, how)) + # This check is to prevent breaking API changes where users prior to + # DF 43.0.0 would pass the join_keys as a positional argument instead + # of a keyword argument. + if isinstance(on, tuple) and len(on) == 2: + if isinstance(on[0], list) and isinstance(on[1], list): + join_keys = on # type: ignore + on = None + + if join_keys is not None: + warnings.warn( + "`join_keys` is deprecated, use `on` or `left_on` with `right_on`", + category=DeprecationWarning, + stacklevel=2, + ) + left_on = join_keys[0] + right_on = join_keys[1] + + if on: + if left_on or right_on: + raise ValueError( + "`left_on` or `right_on` should not provided with `on`" + ) + left_on = on + right_on = on + elif left_on or right_on: + if left_on is None or right_on is None: + raise ValueError("`left_on` and `right_on` should both be provided.") + else: + raise ValueError( + "either `on` or `left_on` and `right_on` should be provided." + ) + if isinstance(left_on, str): + left_on = [left_on] + if isinstance(right_on, str): + right_on = [right_on] + + return DataFrame(self.df.join(right.df, how, left_on, right_on)) def join_on( self, diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 7b20e9e39..330475302 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -321,14 +321,72 @@ def test_join(): ) df1 = ctx.create_dataframe([[batch]], "r") - df = df.join(df1, join_keys=(["a"], ["a"]), how="inner") - df.show() - df = df.sort(column("l.a")) - table = pa.Table.from_batches(df.collect()) + df2 = df.join(df1, on="a", how="inner") + df2.show() + df2 = df2.sort(column("l.a")) + table = pa.Table.from_batches(df2.collect()) + + expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]} + assert table.to_pydict() == expected + + df2 = df.join(df1, left_on="a", right_on="a", how="inner") + df2.show() + df2 = df2.sort(column("l.a")) + table = pa.Table.from_batches(df2.collect()) expected = 
{"a": [1, 2], "c": [8, 10], "b": [4, 5]} assert table.to_pydict() == expected + # Verify we don't make a breaking change to pre-43.0.0 + # where users would pass join_keys as a positional argument + df2 = df.join(df1, (["a"], ["a"]), how="inner") # type: ignore + df2.show() + df2 = df2.sort(column("l.a")) + table = pa.Table.from_batches(df2.collect()) + + expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]} + assert table.to_pydict() == expected + + +def test_join_invalid_params(): + ctx = SessionContext() + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + df = ctx.create_dataframe([[batch]], "l") + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2]), pa.array([8, 10])], + names=["a", "c"], + ) + df1 = ctx.create_dataframe([[batch]], "r") + + with pytest.deprecated_call(): + df2 = df.join(df1, join_keys=(["a"], ["a"]), how="inner") + df2.show() + df2 = df2.sort(column("l.a")) + table = pa.Table.from_batches(df2.collect()) + + expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]} + assert table.to_pydict() == expected + + with pytest.raises( + ValueError, match=r"`left_on` or `right_on` should not provided with `on`" + ): + df2 = df.join(df1, on="a", how="inner", right_on="test") # type: ignore + + with pytest.raises( + ValueError, match=r"`left_on` and `right_on` should both be provided." + ): + df2 = df.join(df1, left_on="a", how="inner") # type: ignore + + with pytest.raises( + ValueError, match=r"either `on` or `left_on` and `right_on` should be provided." 
+ ): + df2 = df.join(df1, how="inner") # type: ignore + def test_join_on(): ctx = SessionContext() diff --git a/src/dataframe.rs b/src/dataframe.rs index dd5d89ce9..ee8fbbf9d 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -271,8 +271,9 @@ impl PyDataFrame { fn join( &self, right: PyDataFrame, - join_keys: (Vec, Vec), how: &str, + left_on: Vec, + right_on: Vec, ) -> PyResult { let join_type = match how { "inner" => JoinType::Inner, @@ -289,16 +290,8 @@ impl PyDataFrame { } }; - let left_keys = join_keys - .0 - .iter() - .map(|s| s.as_ref()) - .collect::>(); - let right_keys = join_keys - .1 - .iter() - .map(|s| s.as_ref()) - .collect::>(); + let left_keys = left_on.iter().map(|s| s.as_ref()).collect::>(); + let right_keys = right_on.iter().map(|s| s.as_ref()).collect::>(); let df = self.df.as_ref().clone().join( right.df.as_ref().clone(), From 3c662010f9d133cda65749b119a1b1731edbe4e5 Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Sun, 10 Nov 2024 11:02:01 -0600 Subject: [PATCH 072/248] Upgrade to Datafusion 43 (#905) * patch datafusion deps * migrate from deprecated RuntimeEnv::new to RuntimeEnv::try_new Ref: https://github.com/apache/datafusion/pull/12566 * remove Arc from create_udf call Ref: https://github.com/apache/datafusion/pull/12489 * doc typo * migrage new UnnestOptions API Ref: https://github.com/apache/datafusion/pull/12836/files * update API for logical expr Limit Ref: https://github.com/apache/datafusion/pull/12836 * remove logical expr CrossJoin It was removed upstream. 
Ref: https://github.com/apache/datafusion/pull/13076 * update PyWindowUDF Ref: https://github.com/apache/datafusion/issues/12803 * migrate window functions lead and lag to udwf Ref: https://github.com/apache/datafusion/issues/12802 * migrate window functions rank, dense_rank, and percent_rank to udwf Ref: https://github.com/apache/datafusion/issues/12648 * convert window function cume_dist to udwf Ref: https://github.com/apache/datafusion/issues/12695 * convert window function ntile to udwf Ref: https://github.com/apache/datafusion/issues/12694 * clean up functions_window invocation * Only one column was being passed to udwf * Update to DF 43.0.0 * Update tests to look for string_view type * String view is now the default type for strings * Making a variety of adjustments in wrappers and unit tests to account for the switch from string to string_view as default * Resolve errors in doc building --------- Co-authored-by: Tim Saucer --- Cargo.lock | 373 ++++++++++++++++++--------------- Cargo.toml | 9 +- examples/tpch/_tests.py | 4 +- python/datafusion/expr.py | 4 +- python/datafusion/functions.py | 11 +- python/datafusion/udf.py | 1 + python/tests/test_expr.py | 16 +- python/tests/test_functions.py | 67 ++++-- python/tests/test_imports.py | 2 - python/tests/test_sql.py | 7 + src/context.rs | 2 +- src/dataframe.rs | 8 +- src/expr.rs | 2 - src/expr/cross_join.rs | 94 --------- src/expr/limit.rs | 22 +- src/functions.rs | 18 +- src/sql/logical.rs | 3 +- src/udf.rs | 4 +- src/udwf.rs | 29 ++- 19 files changed, 338 insertions(+), 338 deletions(-) delete mode 100644 src/expr/cross_join.rs diff --git a/Cargo.lock b/Cargo.lock index 0835f219e..497c5b850 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -84,9 +84,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.89" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" +checksum = 
"c042108f3ed77fd83760a5fd79b53be043192bb3b9dba91d8c574c0ada7850c8" [[package]] name = "apache-avro" @@ -130,9 +130,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "53.1.0" +version = "53.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9ba0d7248932f4e2a12fb37f0a2e3ec82b3bdedbac2a1dce186e036843b8f8c" +checksum = "4caf25cdc4a985f91df42ed9e9308e1adbcd341a31a72605c697033fcef163e3" dependencies = [ "arrow-arith", "arrow-array", @@ -152,9 +152,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "53.1.0" +version = "53.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d60afcdc004841a5c8d8da4f4fa22d64eb19c0c01ef4bcedd77f175a7cf6e38f" +checksum = "91f2dfd1a7ec0aca967dfaa616096aec49779adc8eccec005e2f5e4111b1192a" dependencies = [ "arrow-array", "arrow-buffer", @@ -167,9 +167,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "53.1.0" +version = "53.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f16835e8599dbbb1659fd869d865254c4cf32c6c2bb60b6942ac9fc36bfa5da" +checksum = "d39387ca628be747394890a6e47f138ceac1aa912eab64f02519fed24b637af8" dependencies = [ "ahash", "arrow-buffer", @@ -184,9 +184,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "53.1.0" +version = "53.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a1f34f0faae77da6b142db61deba2cb6d60167592b178be317b341440acba80" +checksum = "9e51e05228852ffe3eb391ce7178a0f97d2cf80cc6ef91d3c4a6b3cb688049ec" dependencies = [ "bytes", "half", @@ -195,9 +195,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "53.1.0" +version = "53.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "450e4abb5775bca0740bec0bcf1b1a5ae07eff43bd625661c4436d8e8e4540c4" +checksum = "d09aea56ec9fa267f3f3f6cdab67d8a9974cbba90b3aa38c8fe9d0bb071bd8c1" 
dependencies = [ "arrow-array", "arrow-buffer", @@ -216,9 +216,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "53.1.0" +version = "53.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3a4e4d63830a341713e35d9a42452fbc6241d5f42fa5cf6a4681b8ad91370c4" +checksum = "c07b5232be87d115fde73e32f2ca7f1b353bff1b44ac422d3c6fc6ae38f11f0d" dependencies = [ "arrow-array", "arrow-buffer", @@ -235,9 +235,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "53.1.0" +version = "53.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b1e618bbf714c7a9e8d97203c806734f012ff71ae3adc8ad1b075689f540634" +checksum = "b98ae0af50890b494cebd7d6b04b35e896205c1d1df7b29a6272c5d0d0249ef5" dependencies = [ "arrow-buffer", "arrow-schema", @@ -247,9 +247,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "53.1.0" +version = "53.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f98e983549259a2b97049af7edfb8f28b8911682040e99a94e4ceb1196bd65c2" +checksum = "0ed91bdeaff5a1c00d28d8f73466bcb64d32bbd7093b5a30156b4b9f4dba3eee" dependencies = [ "arrow-array", "arrow-buffer", @@ -262,9 +262,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "53.1.0" +version = "53.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b198b9c6fcf086501730efbbcb483317b39330a116125af7bb06467d04b352a3" +checksum = "0471f51260a5309307e5d409c9dc70aede1cd9cf1d4ff0f0a1e8e1a2dd0e0d3c" dependencies = [ "arrow-array", "arrow-buffer", @@ -282,9 +282,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "53.1.0" +version = "53.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2427f37b4459a4b9e533045abe87a5183a5e0995a3fc2c2fd45027ae2cc4ef3f" +checksum = "2883d7035e0b600fb4c30ce1e50e66e53d8656aa729f2bfa4b51d359cf3ded52" dependencies = [ "arrow-array", "arrow-buffer", @@ -297,9 +297,9 @@ dependencies = [ [[package]] name 
= "arrow-row" -version = "53.1.0" +version = "53.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15959657d92e2261a7a323517640af87f5afd9fd8a6492e424ebee2203c567f6" +checksum = "552907e8e587a6fde4f8843fd7a27a576a260f65dab6c065741ea79f633fc5be" dependencies = [ "ahash", "arrow-array", @@ -311,18 +311,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.1.0" +version = "53.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbf0388a18fd7f7f3fe3de01852d30f54ed5182f9004db700fbe3ba843ed2794" +checksum = "539ada65246b949bd99ffa0881a9a15a4a529448af1a07a9838dd78617dafab1" dependencies = [ "bitflags 2.6.0", ] [[package]] name = "arrow-select" -version = "53.1.0" +version = "53.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b83e5723d307a38bf00ecd2972cd078d1339c7fd3eb044f609958a9a24463f3a" +checksum = "6259e566b752da6dceab91766ed8b2e67bf6270eb9ad8a6e07a33c1bede2b125" dependencies = [ "ahash", "arrow-array", @@ -334,9 +334,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "53.1.0" +version = "53.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ab3db7c09dd826e74079661d84ed01ed06547cf75d52c2818ef776d0d852305" +checksum = "f3179ccbd18ebf04277a095ba7321b93fd1f774f18816bd5f6b3ce2f594edb6c" dependencies = [ "arrow-array", "arrow-buffer", @@ -351,9 +351,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.13" +version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e614738943d3f68c628ae3dbce7c3daffb196665f82f8c8ea6b65de73c79429" +checksum = "0cb8f1d480b0ea3783ab015936d2a55c87e219676f0c0b7dec61494043f21857" dependencies = [ "bzip2", "flate2", @@ -482,9 +482,9 @@ dependencies = [ [[package]] name = "brotli" -version = "6.0.0" +version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"74f7971dbd9326d58187408ab83117d8ac1bb9c17b085fdacd1cf2f598719b6b" +checksum = "cc97b8f16f944bba54f0433f07e30be199b6dc2bd25937444bbad560bcea29bd" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -515,9 +515,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.7.2" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "428d9aa8fbc0670b7b8d6030a7fadd0f86151cae55e4dbbece15f3780a3dfaf3" +checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" [[package]] name = "bzip2" @@ -542,9 +542,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.28" +version = "1.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e80e3b6a3ab07840e1cae9b0666a63970dc28e8ed5ffbcdacbfc760c281bfc1" +checksum = "c2e7962b54006dcfcc61cb72735f4d89bb97061dd6a7ed882ec6b8ee53714c6f" dependencies = [ "jobserver", "libc", @@ -557,6 +557,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "chrono" version = "0.4.38" @@ -725,9 +731,9 @@ dependencies = [ [[package]] name = "dary_heap" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7762d17f1241643615821a8455a0b2c3e803784b058693d990b11f2dce25a0ca" +checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" [[package]] name = "dashmap" @@ -745,9 +751,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ee907b081e45e1d14e1f327e89ef134f91fcebad0bfc2dc229fa9f6044379682" +checksum = "cbba0799cf6913b456ed07a94f0f3b6e12c62a5d88b10809e2284a0f2b915c05" dependencies = [ "ahash", "apache-avro", @@ -804,9 +810,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c2b914f6e33c429af7d8696c72a47ed9225d7e2b82c747ebdfa2408ed53579f" +checksum = "7493c5c2d40eec435b13d92e5703554f4efc7059451fcb8d3a79580ff0e45560" dependencies = [ "arrow-schema", "async-trait", @@ -819,9 +825,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a84f8e76330c582a6b8ada0b2c599ca46cfe46b7585e458fc3f4092bc722a18" +checksum = "24953049ebbd6f8964f91f60aa3514e121b5e81e068e33b60e77815ab369b25c" dependencies = [ "ahash", "apache-avro", @@ -832,6 +838,7 @@ dependencies = [ "chrono", "half", "hashbrown 0.14.5", + "indexmap", "instant", "libc", "num_cpus", @@ -845,9 +852,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf08cc30d92720d557df13bd5a5696213bd5ea0f38a866d8d85055d866fba774" +checksum = "f06df4ef76872e11c924d3c814fd2a8dd09905ed2e2195f71c857d78abd19685" dependencies = [ "log", "tokio", @@ -855,9 +862,9 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86bc4183d5c45b9f068a6f351678a0d1eb1225181424542bb75db18ec280b822" +checksum = "6bbdcb628d690f3ce5fea7de81642b514486d58ff9779a51f180a69a4eadb361" dependencies = [ "arrow", "chrono", @@ -876,9 +883,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "42.0.0" +version = "43.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "202119ce58e4d103e37ae64aab40d4e574c97bdd2bea994bf307b175fcbfa74d" +checksum = "8036495980e3131f706b7d33ab00b4492d73dc714e3cb74d11b50f9602a73246" dependencies = [ "ahash", "arrow", @@ -888,7 +895,9 @@ dependencies = [ "datafusion-common", "datafusion-expr-common", "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", "datafusion-physical-expr-common", + "indexmap", "paste", "serde_json", "sqlparser", @@ -898,20 +907,21 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8b181ce8569216abb01ef3294aa16c0a40d7d39350c2ff01ede00f167a535f2" +checksum = "4da0f3cb4669f9523b403d6b5a0ec85023e0ab3bf0183afd1517475b3e64fdd2" dependencies = [ "arrow", "datafusion-common", + "itertools", "paste", ] [[package]] name = "datafusion-functions" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e4124b8066444e05a24472f852e94cf56546c0f4d92d00f018f207216902712" +checksum = "f52c4012648b34853e40a2c6bcaa8772f837831019b68aca384fb38436dba162" dependencies = [ "arrow", "arrow-buffer", @@ -936,9 +946,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b94acdac235ea21810150a89751617ef2db7e32eba27f54be48a81bde2bfe119" +checksum = "e5b8bb624597ba28ed7446df4a9bd7c7a7bde7c578b6b527da3f47371d5f6741" dependencies = [ "ahash", "arrow", @@ -950,16 +960,16 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "half", + "indexmap", "log", "paste", - "sqlparser", ] [[package]] name = "datafusion-functions-aggregate-common" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5c9ea085bbf900bf16e2ca0f56fc56236b2e4f2e1a2cccb67bcd83c5ab4ad0ef" +checksum = "6fb06208fc470bc8cf1ce2d9a1159d42db591f2c7264a8c1776b53ad8f675143" dependencies = [ "ahash", "arrow", @@ -971,9 +981,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c882e61665ed60c5ce9b061c1e587aeb8ae5ae4bcb5e5f2465139ab25328e0f" +checksum = "fca25bbb87323716d05e54114666e942172ccca23c5a507e9c7851db6e965317" dependencies = [ "arrow", "arrow-array", @@ -994,21 +1004,34 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98a354ce96df3ca6d025093adac9fd55ca09931c9b6f2630140721a95873fde4" +checksum = "5ae23356c634e54c59f7c51acb7a5b9f6240ffb2cf997049a1a24a8a88598dbe" dependencies = [ "datafusion-common", "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-physical-expr", "datafusion-physical-expr-common", "log", + "paste", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4b3d6ff7794acea026de36007077a06b18b89e4f9c3fea7f2215f9f7dd9059b" +dependencies = [ + "datafusion-common", + "datafusion-physical-expr-common", ] [[package]] name = "datafusion-optimizer" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf677c74fb7b5a1899ef52709e4a70fff3ed80bdfb4bbe495909810e83d5f39" +checksum = "bec6241eb80c595fa0e1a8a6b69686b5cf3bd5fdacb8319582a0943b0bd788aa" dependencies = [ "arrow", "async-trait", @@ -1026,9 +1049,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"30b077999f6eb6c43d6b25bc66332a3be2f693c382840f008dd763b8540f9530" +checksum = "3370357b8fc75ec38577700644e5d1b0bc78f38babab99c0b8bd26bafb3e4335" dependencies = [ "ahash", "arrow", @@ -1037,30 +1060,26 @@ dependencies = [ "arrow-ord", "arrow-schema", "arrow-string", - "base64 0.22.1", "chrono", "datafusion-common", - "datafusion-execution", "datafusion-expr", "datafusion-expr-common", "datafusion-functions-aggregate-common", "datafusion-physical-expr-common", "half", "hashbrown 0.14.5", - "hex", "indexmap", "itertools", "log", "paste", "petgraph", - "regex", ] [[package]] name = "datafusion-physical-expr-common" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dce847f885c2b13bbe29f5c8b7948797131aa470af6e16d2a94f4428b4f4f1bd" +checksum = "b8b7734d94bf2fa6f6e570935b0ddddd8421179ce200065be97874e13d46a47b" dependencies = [ "ahash", "arrow", @@ -1072,13 +1091,15 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d13238e3b9fdd62a4c18760bfef714bb990d1e1d3430e9f416aae4b3cfaa71af" +checksum = "7eee8c479522df21d7b395640dff88c5ed05361852dce6544d7c98e9dbcebffe" dependencies = [ + "arrow", "arrow-schema", "datafusion-common", "datafusion-execution", + "datafusion-expr-common", "datafusion-physical-expr", "datafusion-physical-plan", "itertools", @@ -1086,9 +1107,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "faba6f55a7eaf0241d07d12c2640de52742646b10f754485d5192bdfe2c9ceae" +checksum = "17e1fc2e2c239d14e8556f2622b19a726bf6bc6962cc00c71fc52626274bee24" dependencies = [ "ahash", "arrow", @@ -1102,8 +1123,8 @@ dependencies = [ "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", - "datafusion-functions-aggregate", 
"datafusion-functions-aggregate-common", + "datafusion-functions-window-common", "datafusion-physical-expr", "datafusion-physical-expr-common", "futures", @@ -1121,9 +1142,9 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "585357d621fa03ea85a7fefca79ebc5ef0ee13a7f82be0762a414879a4d190a7" +checksum = "f730f7fc5a20134d4e5ecdf7bbf392002ac58163d58423ea28a702dc077b06e1" dependencies = [ "arrow", "chrono", @@ -1137,9 +1158,9 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4db6534382f92f528bdb5d925b4214c31ffd84fa7fe1eff3ed0d2f1286851ab8" +checksum = "12c225fe49e4f943e35446b263613ada7a9e9f8d647544e6b07037b9803567df" dependencies = [ "arrow", "chrono", @@ -1155,6 +1176,7 @@ dependencies = [ "arrow", "async-trait", "datafusion", + "datafusion-functions-window-common", "datafusion-proto", "datafusion-substrait", "futures", @@ -1171,15 +1193,16 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dad8d96a9b52e1aa24f9373696a815be828193efce7cb0bbd2140b6bb67d1819" +checksum = "63e3a4ed41dbee20a5d947a59ca035c225d67dc9cbe869c10f66dcdf25e7ce51" dependencies = [ "arrow", "arrow-array", "arrow-schema", "datafusion-common", "datafusion-expr", + "indexmap", "log", "regex", "sqlparser", @@ -1188,9 +1211,9 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f92b1b80e98bf5a9921bf118816e0e766d18527e343153321fcccfe4d68c5c45" +checksum = "8b9c768d2b4c4485c43afbaeeb86dd1f2ac3fb34a9e6e8c8b06180d2a223d5ba" dependencies = [ "arrow-buffer", "async-recursion", @@ -1530,9 +1553,9 @@ checksum = 
"9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "1.4.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" +checksum = "bbbff0a806a4728c99295b254c8838933b5b082d75e3cb70c8dab21fdfbcfa9a" dependencies = [ "bytes", "futures-channel", @@ -1568,9 +1591,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41296eb09f183ac68eec06e03cdbea2e759633d4067b2f6552fc2e009bcad08b" +checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4" dependencies = [ "bytes", "futures-channel", @@ -1684,9 +1707,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.70" +version = "0.3.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a" +checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" dependencies = [ "wasm-bindgen", ] @@ -1763,9 +1786,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.159" +version = "0.2.161" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" +checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" [[package]] name = "libflate" @@ -1793,9 +1816,9 @@ dependencies = [ [[package]] name = "libm" -version = "0.2.8" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" [[package]] name = "libmimalloc-sys" @@ -2011,9 +2034,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.11.0" +version = "0.11.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "25a0c4b3a0e31f8b66f71ad8064521efa773910196e2cde791436f13409f3b45" +checksum = "6eb4c22c6154a1e759d7099f9ffad7cc5ef8245f9efbab4a41b92623079c82f3" dependencies = [ "async-trait", "base64 0.22.1", @@ -2086,9 +2109,9 @@ dependencies = [ [[package]] name = "parquet" -version = "53.1.0" +version = "53.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "310c46a70a3ba90d98fec39fa2da6d9d731e544191da6fb56c9d199484d0dd3e" +checksum = "dea02606ba6f5e856561d8d507dba8bac060aefca2a6c0f1aa1d361fed91ff3e" dependencies = [ "ahash", "arrow-array", @@ -2228,9 +2251,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" +checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" [[package]] name = "pin-utils" @@ -2261,9 +2284,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.22" +version = "0.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "479cf940fbbb3426c32c5d5176f62ad57549a0bb84773423ba8be9d089f5faba" +checksum = "64d1ec885c64d0457d564db4ec299b2dae3f9c02808b8ad9c3a089c591b18033" dependencies = [ "proc-macro2", "syn", @@ -2271,9 +2294,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.86" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" dependencies = [ "unicode-ident", ] @@ -2342,9 +2365,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"00e89ce2565d6044ca31a3eb79a334c3a79a841120a98f64eea9f579564cb691" +checksum = "3d922163ba1f79c04bc49073ba7b32fd5a8d3b76a87c955921234b8e77333c51" dependencies = [ "cfg-if", "indoc", @@ -2360,9 +2383,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8afbaf3abd7325e08f35ffb8deb5892046fcb2608b703db6a583a5ba4cea01e" +checksum = "bc38c5feeb496c8321091edf3d63e9a6829eab4b863b4a6a65f26f3e9cc6b179" dependencies = [ "once_cell", "target-lexicon", @@ -2370,9 +2393,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec15a5ba277339d04763f4c23d85987a5b08cbb494860be141e6a10a8eb88022" +checksum = "94845622d88ae274d2729fcefc850e63d7a3ddff5e3ce11bd88486db9f1d357d" dependencies = [ "libc", "pyo3-build-config", @@ -2380,9 +2403,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15e0f01b5364bcfbb686a52fc4181d412b708a68ed20c330db9fc8d2c2bf5a43" +checksum = "e655aad15e09b94ffdb3ce3d217acf652e26bbc37697ef012f5e5e348c716e5e" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -2392,9 +2415,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a09b550200e1e5ed9176976d0060cbc2ea82dc8515da07885e7b8153a85caacb" +checksum = "ae1e3f09eecd94618f60a455a23def79f79eba4dc561a97324bf9ac8c6df30ce" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -2456,10 +2479,11 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.5" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fe68c2e9e1a1234e218683dbdf9f9dfcb094113c5ac2b938dfcb9bab4c4140b" +checksum = 
"e346e016eacfff12233c243718197ca12f148c84e1e84268a896699b41c71780" dependencies = [ + "cfg_aliases", "libc", "once_cell", "socket2", @@ -2517,9 +2541,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -2552,9 +2576,9 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "regress" -version = "0.9.1" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0eae2a1ebfecc58aff952ef8ccd364329abe627762f5bf09ff42eb9d98522479" +checksum = "1541daf4e4ed43a0922b7969bdc2170178bcacc5dabf7e39bc508a9fa3953a7a" dependencies = [ "hashbrown 0.14.5", "memchr", @@ -2562,9 +2586,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.8" +version = "0.12.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f713147fbe92361e52392c73b8c9e48c04c6625bce969ef54dc901e58e042a7b" +checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" dependencies = [ "base64 0.22.1", "bytes", @@ -2649,9 +2673,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.37" +version = "0.38.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811" +checksum = "aa260229e6538e52293eeb577aabd09945a09d6d9cc0fc550ed7529056c2e32a" dependencies = [ "bitflags 2.6.0", "errno", @@ -2662,9 +2686,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.14" +version = "0.23.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "415d9944693cb90382053259f89fbb077ea730ad7273047ec63b19bc9b160ba8" +checksum = 
"eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e" dependencies = [ "once_cell", "ring", @@ -2698,9 +2722,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e696e35370c65c9c541198af4543ccd580cf17fc25d8e05c5a242b202488c55" +checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" [[package]] name = "rustls-webpki" @@ -2715,9 +2739,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" +checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" [[package]] name = "ryu" @@ -2813,18 +2837,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.210" +version = "1.0.214" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" +checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.210" +version = "1.0.214" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" +checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" dependencies = [ "proc-macro2", "quote", @@ -2844,9 +2868,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.128" +version = "1.0.132" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8" +checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" dependencies = [ "itoa", "memchr", @@ -2974,9 +2998,9 
@@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "sqlparser" -version = "0.50.0" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2e5b515a2bd5168426033e9efbfd05500114833916f1d5c268f938b4ee130ac" +checksum = "5fe11944a61da0da3f592e19a45ebe5ab92dc14a779907ff1f08fbb797bfefc7" dependencies = [ "log", "sqlparser_derive", @@ -3042,9 +3066,9 @@ dependencies = [ [[package]] name = "substrait" -version = "0.41.9" +version = "0.45.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a3bf05f1d7a3fd7a97790d410f6e859b3a98dcde05e7a3fc00b31b0f60fe7cb" +checksum = "a127ae9d8e443cea5c2122eb2ffe5fe489e802a1e746a09c5a5cb59d074c0aeb" dependencies = [ "heck 0.5.0", "pbjson", @@ -3055,6 +3079,7 @@ dependencies = [ "prost-build", "prost-types", "protobuf-src", + "regress", "schemars", "semver", "serde", @@ -3073,9 +3098,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.79" +version = "2.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" +checksum = "5023162dfcd14ef8f32034d8bcd4cc5ddc61ef7a247c024a33e24e1f24d21b56" dependencies = [ "proc-macro2", "quote", @@ -3112,18 +3137,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.64" +version = "1.0.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" +checksum = "5d11abd9594d9b38965ef50805c5e469ca9cc6f197f883f717e0269a3057b3d5" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.64" +version = "1.0.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" +checksum = 
"ae71770322cbd277e69d762a16c444af02aa0575ac0d174f0b9562d3b37f8602" dependencies = [ "proc-macro2", "quote", @@ -3167,9 +3192,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.40.0" +version = "1.41.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2b070231665d27ad9ec9b8df639893f46727666c6767db40317fbe920a5d998" +checksum = "145f3413504347a2be84393cc8a7d2fb4d863b375909ea59f2158261aa258bbb" dependencies = [ "backtrace", "bytes", @@ -3297,9 +3322,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "typify" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb6beec125971dda80a086f90b4a70f60f222990ce4d63ad0fc140492f53444" +checksum = "b4c644dda9862f0fef3a570d8ddb3c2cfb1d5ac824a1f2ddfa7bc8f071a5ad8a" dependencies = [ "typify-impl", "typify-macro", @@ -3307,9 +3332,9 @@ dependencies = [ [[package]] name = "typify-impl" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93bbb24e990654aff858d80fee8114f4322f7d7a1b1ecb45129e2fcb0d0ad5ae" +checksum = "d59ab345b6c0d8ae9500b9ff334a4c7c0d316c1c628dc55726b95887eb8dbd11" dependencies = [ "heck 0.5.0", "log", @@ -3327,9 +3352,9 @@ dependencies = [ [[package]] name = "typify-macro" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8e6491896e955692d68361c68db2b263e3bec317ec0b684e0e2fa882fb6e31e" +checksum = "785e2cdcef0df8160fdd762ed548a637aaec1e83704fdbc14da0df66013ee8d0" dependencies = [ "proc-macro2", "quote", @@ -3447,9 +3472,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5" +checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" dependencies = [ "cfg-if", "once_cell", @@ -3458,9 +3483,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b" +checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" dependencies = [ "bumpalo", "log", @@ -3473,9 +3498,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.43" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e9300f63a621e96ed275155c108eb6f843b6a26d053f122ab69724559dc8ed" +checksum = "cc7ec4f8827a71586374db3e87abdb5a2bb3a15afed140221307c3ec06b1f63b" dependencies = [ "cfg-if", "js-sys", @@ -3485,9 +3510,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf" +checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3495,9 +3520,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" +checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" dependencies = [ "proc-macro2", "quote", @@ -3508,15 +3533,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" +checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" [[package]] name = "wasm-streams" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e072d4e72f700fb3443d8fe94a39315df013eef1104903cdb0a2abd322bbecd" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" dependencies = [ "futures-util", "js-sys", @@ -3527,9 +3552,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.70" +version = "0.3.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26fdeaafd9bd129f65e7c031593c24d62186301e0c72c8978fa1678be7d532c0" +checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 073f82cf0..11ce08c75 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,9 +37,10 @@ substrait = ["dep:datafusion-substrait"] tokio = { version = "1.39", features = ["macros", "rt", "rt-multi-thread", "sync"] } pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] } arrow = { version = "53", features = ["pyarrow"] } -datafusion = { version = "42.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } -datafusion-substrait = { version = "42.0.0", optional = true } -datafusion-proto = { version = "42.0.0" } +datafusion = { version = "43.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } +datafusion-substrait = { version = "43.0.0", optional = true } +datafusion-proto = { version = "43.0.0" } +datafusion-functions-window-common = { version = "43.0.0" } prost = "0.13" # keep in line with `datafusion-substrait` uuid = { version = "1.11", features = ["v4"] } mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] } @@ -58,4 +59,4 @@ crate-type = ["cdylib", "rlib"] [profile.release] lto = true 
-codegen-units = 1 +codegen-units = 1 \ No newline at end of file diff --git a/examples/tpch/_tests.py b/examples/tpch/_tests.py index 13144ae9d..3ce9cdfe5 100644 --- a/examples/tpch/_tests.py +++ b/examples/tpch/_tests.py @@ -25,7 +25,7 @@ def df_selection(col_name, col_type): if col_type == pa.float64() or isinstance(col_type, pa.Decimal128Type): return F.round(col(col_name), lit(2)).alias(col_name) - elif col_type == pa.string(): + elif col_type == pa.string() or col_type == pa.string_view(): return F.trim(col(col_name)).alias(col_name) else: return col(col_name) @@ -43,7 +43,7 @@ def load_schema(col_name, col_type): def expected_selection(col_name, col_type): if col_type == pa.int64() or col_type == pa.int32(): return F.trim(col(col_name)).cast(col_type).alias(col_name) - elif col_type == pa.string(): + elif col_type == pa.string() or col_type == pa.string_view(): return F.trim(col(col_name)).alias(col_name) else: return col(col_name) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index c4e7713f3..b10724381 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -51,7 +51,6 @@ Column = expr_internal.Column CreateMemoryTable = expr_internal.CreateMemoryTable CreateView = expr_internal.CreateView -CrossJoin = expr_internal.CrossJoin Distinct = expr_internal.Distinct DropTable = expr_internal.DropTable EmptyRelation = expr_internal.EmptyRelation @@ -140,7 +139,6 @@ "Join", "JoinType", "JoinConstraint", - "CrossJoin", "Union", "Unnest", "UnnestExpr", @@ -376,6 +374,8 @@ def literal(value: Any) -> Expr: ``value`` must be a valid PyArrow scalar value or easily castable to one. 
""" + if isinstance(value, str): + value = pa.scalar(value, type=pa.string_view()) if not isinstance(value, pa.Scalar): value = pa.scalar(value) return Expr(expr_internal.Expr.literal(value)) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 907f801af..5a2eab56d 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -297,7 +297,7 @@ def decode(input: Expr, encoding: Expr) -> Expr: def array_to_string(expr: Expr, delimiter: Expr) -> Expr: """Converts each element to its text representation.""" - return Expr(f.array_to_string(expr.expr, delimiter.expr)) + return Expr(f.array_to_string(expr.expr, delimiter.expr.cast(pa.string()))) def array_join(expr: Expr, delimiter: Expr) -> Expr: @@ -1067,7 +1067,10 @@ def struct(*args: Expr) -> Expr: def named_struct(name_pairs: list[tuple[str, Expr]]) -> Expr: """Returns a struct with the given names and arguments pairs.""" - name_pair_exprs = [[Expr.literal(pair[0]), pair[1]] for pair in name_pairs] + name_pair_exprs = [ + [Expr.literal(pa.scalar(pair[0], type=pa.string())), pair[1]] + for pair in name_pairs + ] # flatten name_pairs = [x.expr for xs in name_pair_exprs for x in xs] @@ -1424,7 +1427,9 @@ def array_sort(array: Expr, descending: bool = False, null_first: bool = False) nulls_first = "NULLS FIRST" if null_first else "NULLS LAST" return Expr( f.array_sort( - array.expr, Expr.literal(desc).expr, Expr.literal(nulls_first).expr + array.expr, + Expr.literal(pa.scalar(desc, type=pa.string())).expr, + Expr.literal(pa.scalar(nulls_first, type=pa.string())).expr, ) ) diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index 291ef2bae..d9d994b22 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -229,6 +229,7 @@ def udaf( which this UDAF is used. The following examples are all valid. .. 
code-block:: python + import pyarrow as pa import pyarrow.compute as pc diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py index 1847edef2..77f88aa44 100644 --- a/python/tests/test_expr.py +++ b/python/tests/test_expr.py @@ -85,14 +85,18 @@ def test_limit(test_ctx): plan = plan.to_variant() assert isinstance(plan, Limit) - assert plan.skip() == 0 + # TODO: Upstream now has expressions for skip and fetch + # REF: https://github.com/apache/datafusion/pull/12836 + # assert plan.skip() == 0 df = test_ctx.sql("select c1 from test LIMIT 10 OFFSET 5") plan = df.logical_plan() plan = plan.to_variant() assert isinstance(plan, Limit) - assert plan.skip() == 5 + # TODO: Upstream now has expressions for skip and fetch + # REF: https://github.com/apache/datafusion/pull/12836 + # assert plan.skip() == 5 def test_aggregate_query(test_ctx): @@ -126,7 +130,10 @@ def test_relational_expr(test_ctx): ctx = SessionContext() batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array(["alpha", "beta", "gamma"])], + [ + pa.array([1, 2, 3]), + pa.array(["alpha", "beta", "gamma"], type=pa.string_view()), + ], names=["a", "b"], ) df = ctx.create_dataframe([[batch]], name="batch_array") @@ -141,7 +148,8 @@ def test_relational_expr(test_ctx): assert df.filter(col("b") == "beta").count() == 1 assert df.filter(col("b") != "beta").count() == 2 - assert df.filter(col("a") == "beta").count() == 0 + with pytest.raises(Exception): + df.filter(col("a") == "beta").count() def test_expr_to_variant(): diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index c65c633a4..b3a5a0652 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -34,9 +34,9 @@ def df(): # create a RecordBatch and a new DataFrame from it batch = pa.RecordBatch.from_arrays( [ - pa.array(["Hello", "World", "!"]), + pa.array(["Hello", "World", "!"], type=pa.string_view()), pa.array([4, 5, 6]), - pa.array(["hello ", " world ", " !"]), + pa.array(["hello ", " 
world ", " !"], type=pa.string_view()), pa.array( [ datetime(2022, 12, 31), @@ -88,8 +88,8 @@ def test_literal(df): assert len(result) == 1 result = result[0] assert result.column(0) == pa.array([1] * 3) - assert result.column(1) == pa.array(["1"] * 3) - assert result.column(2) == pa.array(["OK"] * 3) + assert result.column(1) == pa.array(["1"] * 3, type=pa.string_view()) + assert result.column(2) == pa.array(["OK"] * 3, type=pa.string_view()) assert result.column(3) == pa.array([3.14] * 3) assert result.column(4) == pa.array([True] * 3) assert result.column(5) == pa.array([b"hello world"] * 3) @@ -97,7 +97,9 @@ def test_literal(df): def test_lit_arith(df): """Test literals with arithmetic operations""" - df = df.select(literal(1) + column("b"), f.concat(column("a"), literal("!"))) + df = df.select( + literal(1) + column("b"), f.concat(column("a").cast(pa.string()), literal("!")) + ) result = df.collect() assert len(result) == 1 result = result[0] @@ -600,21 +602,33 @@ def test_array_function_obj_tests(stmt, py_expr): f.ascii(column("a")), pa.array([72, 87, 33], type=pa.int32()), ), # H = 72; W = 87; ! 
= 33 - (f.bit_length(column("a")), pa.array([40, 40, 8], type=pa.int32())), - (f.btrim(literal(" World ")), pa.array(["World", "World", "World"])), + ( + f.bit_length(column("a").cast(pa.string())), + pa.array([40, 40, 8], type=pa.int32()), + ), + ( + f.btrim(literal(" World ")), + pa.array(["World", "World", "World"], type=pa.string_view()), + ), (f.character_length(column("a")), pa.array([5, 5, 1], type=pa.int32())), (f.chr(literal(68)), pa.array(["D", "D", "D"])), ( f.concat_ws("-", column("a"), literal("test")), pa.array(["Hello-test", "World-test", "!-test"]), ), - (f.concat(column("a"), literal("?")), pa.array(["Hello?", "World?", "!?"])), + ( + f.concat(column("a").cast(pa.string()), literal("?")), + pa.array(["Hello?", "World?", "!?"]), + ), (f.initcap(column("c")), pa.array(["Hello ", " World ", " !"])), (f.left(column("a"), literal(3)), pa.array(["Hel", "Wor", "!"])), (f.length(column("c")), pa.array([6, 7, 2], type=pa.int32())), (f.lower(column("a")), pa.array(["hello", "world", "!"])), (f.lpad(column("a"), literal(7)), pa.array([" Hello", " World", " !"])), - (f.ltrim(column("c")), pa.array(["hello ", "world ", "!"])), + ( + f.ltrim(column("c")), + pa.array(["hello ", "world ", "!"], type=pa.string_view()), + ), ( f.md5(column("a")), pa.array( @@ -640,19 +654,25 @@ def test_array_function_obj_tests(stmt, py_expr): f.rpad(column("a"), literal(8)), pa.array(["Hello ", "World ", "! 
"]), ), - (f.rtrim(column("c")), pa.array(["hello", " world", " !"])), + ( + f.rtrim(column("c")), + pa.array(["hello", " world", " !"], type=pa.string_view()), + ), ( f.split_part(column("a"), literal("l"), literal(1)), pa.array(["He", "Wor", "!"]), ), (f.starts_with(column("a"), literal("Wor")), pa.array([False, True, False])), (f.strpos(column("a"), literal("o")), pa.array([5, 2, 0], type=pa.int32())), - (f.substr(column("a"), literal(3)), pa.array(["llo", "rld", ""])), + ( + f.substr(column("a"), literal(3)), + pa.array(["llo", "rld", ""], type=pa.string_view()), + ), ( f.translate(column("a"), literal("or"), literal("ld")), pa.array(["Helll", "Wldld", "!"]), ), - (f.trim(column("c")), pa.array(["hello", "world", "!"])), + (f.trim(column("c")), pa.array(["hello", "world", "!"], type=pa.string_view())), (f.upper(column("c")), pa.array(["HELLO ", " WORLD ", " !"])), (f.ends_with(column("a"), literal("llo")), pa.array([True, False, False])), ( @@ -794,9 +814,9 @@ def test_temporal_functions(df): f.date_trunc(literal("month"), column("d")), f.datetrunc(literal("day"), column("d")), f.date_bin( - literal("15 minutes"), + literal("15 minutes").cast(pa.string()), column("d"), - literal("2001-01-01 00:02:30"), + literal("2001-01-01 00:02:30").cast(pa.string()), ), f.from_unixtime(literal(1673383974)), f.to_timestamp(literal("2023-09-07 05:06:14.523952")), @@ -858,8 +878,8 @@ def test_case(df): result = df.collect() result = result[0] assert result.column(0) == pa.array([10, 8, 8]) - assert result.column(1) == pa.array(["Hola", "Mundo", "!!"]) - assert result.column(2) == pa.array(["Hola", "Mundo", None]) + assert result.column(1) == pa.array(["Hola", "Mundo", "!!"], type=pa.string_view()) + assert result.column(2) == pa.array(["Hola", "Mundo", None], type=pa.string_view()) def test_when_with_no_base(df): @@ -877,8 +897,10 @@ def test_when_with_no_base(df): result = df.collect() result = result[0] assert result.column(0) == pa.array([4, 5, 6]) - assert result.column(1) 
== pa.array(["too small", "just right", "too big"]) - assert result.column(2) == pa.array(["Hello", None, None]) + assert result.column(1) == pa.array( + ["too small", "just right", "too big"], type=pa.string_view() + ) + assert result.column(2) == pa.array(["Hello", None, None], type=pa.string_view()) def test_regr_funcs_sql(df): @@ -1021,8 +1043,13 @@ def test_regr_funcs_df(func, expected): def test_binary_string_functions(df): df = df.select( - f.encode(column("a"), literal("base64")), - f.decode(f.encode(column("a"), literal("base64")), literal("base64")), + f.encode(column("a").cast(pa.string()), literal("base64").cast(pa.string())), + f.decode( + f.encode( + column("a").cast(pa.string()), literal("base64").cast(pa.string()) + ), + literal("base64").cast(pa.string()), + ), ) result = df.collect() assert len(result) == 1 diff --git a/python/tests/test_imports.py b/python/tests/test_imports.py index 3d324fb62..6ea77b15f 100644 --- a/python/tests/test_imports.py +++ b/python/tests/test_imports.py @@ -46,7 +46,6 @@ Join, JoinType, JoinConstraint, - CrossJoin, Union, Like, ILike, @@ -129,7 +128,6 @@ def test_class_module_is_datafusion(): Join, JoinType, JoinConstraint, - CrossJoin, Union, Like, ILike, diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py index 39e5ffe6d..a2521dd09 100644 --- a/python/tests/test_sql.py +++ b/python/tests/test_sql.py @@ -468,6 +468,13 @@ def test_simple_select(ctx, tmp_path, arr): batches = ctx.sql("SELECT a AS tt FROM t").collect() result = batches[0].column(0) + # In DF 43.0.0 we now default to having BinaryView and StringView + # so the array that is saved to the parquet is slightly different + # than the array read. Convert to values for comparison. 
+ if isinstance(result, pa.BinaryViewArray) or isinstance(result, pa.StringViewArray): + arr = arr.tolist() + result = result.tolist() + np.testing.assert_equal(result, arr) diff --git a/src/context.rs b/src/context.rs index f445874d6..c2a263fa7 100644 --- a/src/context.rs +++ b/src/context.rs @@ -287,7 +287,7 @@ impl PySessionContext { } else { RuntimeConfig::default() }; - let runtime = Arc::new(RuntimeEnv::new(runtime_config)?); + let runtime = Arc::new(RuntimeEnv::try_new(runtime_config)?); let session_state = SessionStateBuilder::new() .with_config(config) .with_runtime_env(runtime) diff --git a/src/dataframe.rs b/src/dataframe.rs index ee8fbbf9d..e7d6ca6d6 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -402,7 +402,9 @@ impl PyDataFrame { #[pyo3(signature = (column, preserve_nulls=true))] fn unnest_column(&self, column: &str, preserve_nulls: bool) -> PyResult { - let unnest_options = UnnestOptions { preserve_nulls }; + // TODO: expose RecursionUnnestOptions + // REF: https://github.com/apache/datafusion/pull/11577 + let unnest_options = UnnestOptions::default().with_preserve_nulls(preserve_nulls); let df = self .df .as_ref() @@ -413,7 +415,9 @@ impl PyDataFrame { #[pyo3(signature = (columns, preserve_nulls=true))] fn unnest_columns(&self, columns: Vec, preserve_nulls: bool) -> PyResult { - let unnest_options = UnnestOptions { preserve_nulls }; + // TODO: expose RecursionUnnestOptions + // REF: https://github.com/apache/datafusion/pull/11577 + let unnest_options = UnnestOptions::default().with_preserve_nulls(preserve_nulls); let cols = columns.iter().map(|s| s.as_ref()).collect::>(); let df = self .df diff --git a/src/expr.rs b/src/expr.rs index 49fa4b845..bca0cd3fa 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -65,7 +65,6 @@ pub mod column; pub mod conditional_expr; pub mod create_memory_table; pub mod create_view; -pub mod cross_join; pub mod distinct; pub mod drop_table; pub mod empty_relation; @@ -775,7 +774,6 @@ pub(crate) fn init_module(m: 
&Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/src/expr/cross_join.rs b/src/expr/cross_join.rs deleted file mode 100644 index 5bc202aac..000000000 --- a/src/expr/cross_join.rs +++ /dev/null @@ -1,94 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use datafusion::logical_expr::logical_plan::CrossJoin; -use pyo3::prelude::*; -use std::fmt::{self, Display, Formatter}; - -use super::logical_node::LogicalNode; -use crate::common::df_schema::PyDFSchema; -use crate::sql::logical::PyLogicalPlan; - -#[pyclass(name = "CrossJoin", module = "datafusion.expr", subclass)] -#[derive(Clone)] -pub struct PyCrossJoin { - cross_join: CrossJoin, -} - -impl From for PyCrossJoin { - fn from(cross_join: CrossJoin) -> PyCrossJoin { - PyCrossJoin { cross_join } - } -} - -impl From for CrossJoin { - fn from(cross_join: PyCrossJoin) -> Self { - cross_join.cross_join - } -} - -impl Display for PyCrossJoin { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - write!( - f, - "CrossJoin - \nLeft: {:?} - \nRight: {:?} - \nSchema: {:?}", - &self.cross_join.left, &self.cross_join.right, &self.cross_join.schema - ) - } -} - -#[pymethods] -impl PyCrossJoin { - /// Retrieves the left input `LogicalPlan` to this `CrossJoin` node - fn left(&self) -> PyResult { - Ok(self.cross_join.left.as_ref().clone().into()) - } - - /// Retrieves the right input `LogicalPlan` to this `CrossJoin` node - fn right(&self) -> PyResult { - Ok(self.cross_join.right.as_ref().clone().into()) - } - - /// Resulting Schema for this `CrossJoin` node instance - fn schema(&self) -> PyResult { - Ok(self.cross_join.schema.as_ref().clone().into()) - } - - fn __repr__(&self) -> PyResult { - Ok(format!("CrossJoin({})", self)) - } - - fn __name__(&self) -> PyResult { - Ok("CrossJoin".to_string()) - } -} - -impl LogicalNode for PyCrossJoin { - fn inputs(&self) -> Vec { - vec![ - PyLogicalPlan::from((*self.cross_join.left).clone()), - PyLogicalPlan::from((*self.cross_join.right).clone()), - ] - } - - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) - } -} diff --git a/src/expr/limit.rs b/src/expr/limit.rs index 876e154c1..84ad7d68b 100644 --- a/src/expr/limit.rs +++ b/src/expr/limit.rs @@ -46,7 +46,7 @@ impl Display for PyLimit { write!( f, "Limit - 
Skip: {} + Skip: {:?} Fetch: {:?} Input: {:?}", &self.limit.skip, &self.limit.fetch, &self.limit.input @@ -56,15 +56,19 @@ impl Display for PyLimit { #[pymethods] impl PyLimit { - /// Retrieves the skip value for this `Limit` - fn skip(&self) -> usize { - self.limit.skip - } + // NOTE: Upstream now has expressions for skip and fetch + // TODO: Do we still want to expose these? + // REF: https://github.com/apache/datafusion/pull/12836 - /// Retrieves the fetch value for this `Limit` - fn fetch(&self) -> Option { - self.limit.fetch - } + // /// Retrieves the skip value for this `Limit` + // fn skip(&self) -> usize { + // self.limit.skip + // } + + // /// Retrieves the fetch value for this `Limit` + // fn fetch(&self) -> Option { + // self.limit.fetch + // } /// Retrieves the input `LogicalPlan` to this `Limit` node fn input(&self) -> PyResult> { diff --git a/src/functions.rs b/src/functions.rs index fe3531ba9..e29c57f9b 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -16,7 +16,6 @@ // under the License. 
use datafusion::functions_aggregate::all_default_aggregate_functions; -use datafusion::logical_expr::window_function; use datafusion::logical_expr::ExprFunctionExt; use datafusion::logical_expr::WindowFrame; use pyo3::{prelude::*, wrap_pyfunction}; @@ -33,6 +32,7 @@ use datafusion::common::{Column, ScalarValue, TableReference}; use datafusion::execution::FunctionRegistry; use datafusion::functions; use datafusion::functions_aggregate; +use datafusion::functions_window; use datafusion::logical_expr::expr::Alias; use datafusion::logical_expr::sqlparser::ast::NullTreatment as DFNullTreatment; use datafusion::logical_expr::{ @@ -758,7 +758,7 @@ pub fn lead( partition_by: Option>, order_by: Option>, ) -> PyResult { - let window_fn = window_function::lead(arg.expr, Some(shift_offset), default_value); + let window_fn = functions_window::expr_fn::lead(arg.expr, Some(shift_offset), default_value); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) } @@ -772,7 +772,7 @@ pub fn lag( partition_by: Option>, order_by: Option>, ) -> PyResult { - let window_fn = window_function::lag(arg.expr, Some(shift_offset), default_value); + let window_fn = functions_window::expr_fn::lag(arg.expr, Some(shift_offset), default_value); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) } @@ -783,7 +783,7 @@ pub fn row_number( partition_by: Option>, order_by: Option>, ) -> PyResult { - let window_fn = datafusion::functions_window::expr_fn::row_number(); + let window_fn = functions_window::expr_fn::row_number(); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) } @@ -794,7 +794,7 @@ pub fn rank( partition_by: Option>, order_by: Option>, ) -> PyResult { - let window_fn = window_function::rank(); + let window_fn = functions_window::expr_fn::rank(); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) } @@ -805,7 +805,7 @@ pub fn dense_rank( partition_by: Option>, order_by: Option>, ) -> PyResult { - let window_fn 
= window_function::dense_rank(); + let window_fn = functions_window::expr_fn::dense_rank(); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) } @@ -816,7 +816,7 @@ pub fn percent_rank( partition_by: Option>, order_by: Option>, ) -> PyResult { - let window_fn = window_function::percent_rank(); + let window_fn = functions_window::expr_fn::percent_rank(); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) } @@ -827,7 +827,7 @@ pub fn cume_dist( partition_by: Option>, order_by: Option>, ) -> PyResult { - let window_fn = window_function::cume_dist(); + let window_fn = functions_window::expr_fn::cume_dist(); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) } @@ -839,7 +839,7 @@ pub fn ntile( partition_by: Option>, order_by: Option>, ) -> PyResult { - let window_fn = window_function::ntile(arg.into()); + let window_fn = functions_window::expr_fn::ntile(arg.into()); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) } diff --git a/src/sql/logical.rs b/src/sql/logical.rs index fc398ff89..40f0a6a65 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -19,7 +19,6 @@ use std::sync::Arc; use crate::expr::aggregate::PyAggregate; use crate::expr::analyze::PyAnalyze; -use crate::expr::cross_join::PyCrossJoin; use crate::expr::distinct::PyDistinct; use crate::expr::empty_relation::PyEmptyRelation; use crate::expr::explain::PyExplain; @@ -68,7 +67,6 @@ impl PyLogicalPlan { match self.plan.as_ref() { LogicalPlan::Aggregate(plan) => PyAggregate::from(plan.clone()).to_variant(py), LogicalPlan::Analyze(plan) => PyAnalyze::from(plan.clone()).to_variant(py), - LogicalPlan::CrossJoin(plan) => PyCrossJoin::from(plan.clone()).to_variant(py), LogicalPlan::Distinct(plan) => PyDistinct::from(plan.clone()).to_variant(py), LogicalPlan::EmptyRelation(plan) => PyEmptyRelation::from(plan.clone()).to_variant(py), LogicalPlan::Explain(plan) => PyExplain::from(plan.clone()).to_variant(py), @@ -92,6 +90,7 
@@ impl PyLogicalPlan { | LogicalPlan::Ddl(_) | LogicalPlan::Copy(_) | LogicalPlan::DescribeTable(_) + | LogicalPlan::Execute(_) | LogicalPlan::RecursiveQuery(_) => Err(py_unsupported_variant_err(format!( "Conversion of variant not implemented: {:?}", self.plan diff --git a/src/udf.rs b/src/udf.rs index ec8efb169..4570e77a6 100644 --- a/src/udf.rs +++ b/src/udf.rs @@ -31,7 +31,7 @@ use datafusion::logical_expr::{create_udf, ColumnarValue}; use crate::expr::PyExpr; use crate::utils::parse_volatility; -/// Create a Rust callable function fr a python function that expects pyarrow arrays +/// Create a Rust callable function from a python function that expects pyarrow arrays fn pyarrow_function_to_rust( func: PyObject, ) -> impl Fn(&[ArrayRef]) -> Result { @@ -97,7 +97,7 @@ impl PyScalarUDF { let function = create_udf( name, input_types.0, - Arc::new(return_type.0), + return_type.0, parse_volatility(volatility)?, to_scalar_function_impl(func), ); diff --git a/src/udwf.rs b/src/udwf.rs index 43c21ec7b..3f5ad0b1d 100644 --- a/src/udwf.rs +++ b/src/udwf.rs @@ -20,11 +20,16 @@ use std::ops::Range; use std::sync::Arc; use arrow::array::{make_array, Array, ArrayData, ArrayRef}; +use datafusion::logical_expr::function::{PartitionEvaluatorArgs, WindowUDFFieldArgs}; use datafusion::logical_expr::window_state::WindowAggState; +use datafusion::physical_plan::PhysicalExpr; use datafusion::scalar::ScalarValue; +use datafusion_functions_window_common::expr::ExpressionArgs; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; +use crate::expr::PyExpr; +use crate::utils::parse_volatility; use datafusion::arrow::datatypes::DataType; use datafusion::arrow::pyarrow::{FromPyArrow, PyArrowType, ToPyArrow}; use datafusion::error::{DataFusionError, Result}; @@ -33,9 +38,6 @@ use datafusion::logical_expr::{ }; use pyo3::types::{PyList, PyTuple}; -use crate::expr::PyExpr; -use crate::utils::parse_volatility; - #[derive(Debug)] struct RustPartitionEvaluator { evaluator: PyObject, @@ -90,6 
+92,7 @@ impl PartitionEvaluator for RustPartitionEvaluator { } fn evaluate_all(&mut self, values: &[ArrayRef], num_rows: usize) -> Result { + println!("evaluate all called with number of values {}", values.len()); Python::with_gil(|py| { let py_values = PyList::new_bound( py, @@ -299,11 +302,25 @@ impl WindowUDFImpl for MultiColumnWindowUDF { &self.signature } - fn return_type(&self, _arg_types: &[DataType]) -> Result { - Ok(self.return_type.clone()) + fn field(&self, field_args: WindowUDFFieldArgs) -> Result { + // TODO: Should nullable always be `true`? + Ok(arrow::datatypes::Field::new( + field_args.name(), + self.return_type.clone(), + true, + )) } - fn partition_evaluator(&self) -> Result> { + // TODO: Enable passing partition_evaluator_args to python? + fn partition_evaluator( + &self, + _partition_evaluator_args: PartitionEvaluatorArgs, + ) -> Result> { + let _ = _partition_evaluator_args; (self.partition_evaluator_factory)() } + + fn expressions(&self, expr_args: ExpressionArgs) -> Vec> { + expr_args.input_exprs().into() + } } From e3e55b7cb70ac27e209edfcad2e008f685687e90 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Nov 2024 10:43:22 -0600 Subject: [PATCH 073/248] build(deps): bump tokio from 1.40.0 to 1.41.1 (#946) Bumps [tokio](https://github.com/tokio-rs/tokio) from 1.40.0 to 1.41.1. - [Release notes](https://github.com/tokio-rs/tokio/releases) - [Commits](https://github.com/tokio-rs/tokio/compare/tokio-1.40.0...tokio-1.41.1) --- updated-dependencies: - dependency-name: tokio dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 497c5b850..f483a6a2a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3192,9 +3192,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.41.0" +version = "1.41.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145f3413504347a2be84393cc8a7d2fb4d863b375909ea59f2158261aa258bbb" +checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33" dependencies = [ "backtrace", "bytes", diff --git a/Cargo.toml b/Cargo.toml index 11ce08c75..d86948b3a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,7 +34,7 @@ protoc = [ "datafusion-substrait/protoc" ] substrait = ["dep:datafusion-substrait"] [dependencies] -tokio = { version = "1.39", features = ["macros", "rt", "rt-multi-thread", "sync"] } +tokio = { version = "1.41", features = ["macros", "rt", "rt-multi-thread", "sync"] } pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] } arrow = { version = "53", features = ["pyarrow"] } datafusion = { version = "43.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } From 53cdb11637ac1945bdb8291a72bf4c7fbad95c49 Mon Sep 17 00:00:00 2001 From: kosiew Date: Tue, 12 Nov 2024 19:20:55 +0800 Subject: [PATCH 074/248] Add list_cat, list_concat, list_repeat (#942) * Add list_cat, list_concat * Add list_repeat * docs: add examples for list_cat, list_concat, and list_repeat functions * Amend list_repeat code example - literal * Amend list_ to array_ in documentation --- .../common-operations/expressions.rst | 29 +++++++++++++++++++ python/datafusion/functions.py | 27 +++++++++++++++++ python/tests/test_functions.py | 12 ++++++++ 3 files changed, 68 insertions(+) diff --git 
a/docs/source/user-guide/common-operations/expressions.rst b/docs/source/user-guide/common-operations/expressions.rst index b2a83c89f..e94e1a6b5 100644 --- a/docs/source/user-guide/common-operations/expressions.rst +++ b/docs/source/user-guide/common-operations/expressions.rst @@ -110,6 +110,35 @@ This function returns an integer indicating the total number of elements in the In this example, the `num_elements` column will contain `3` for both rows. +To concatenate two arrays, you can use the function :py:func:`datafusion.functions.array_cat` or :py:func:`datafusion.functions.array_concat`. +These functions return a new array that is the concatenation of the input arrays. + +.. ipython:: python + + from datafusion import SessionContext, col + from datafusion.functions import array_cat, array_concat + + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[4, 5, 6]]}) + df.select(array_cat(col("a"), col("b")).alias("concatenated_array")) + +In this example, the `concatenated_array` column will contain `[1, 2, 3, 4, 5, 6]`. + +To repeat the elements of an array a specified number of times, you can use the function :py:func:`datafusion.functions.array_repeat`. +This function returns a new array with the elements repeated. + +.. ipython:: python + + from datafusion import SessionContext, col, literal + from datafusion.functions import array_repeat + + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 2, 3]]}) + df.select(array_repeat(col("a"), literal(2)).alias("repeated_array")) + +In this example, the `repeated_array` column will contain `[[1, 2, 3], [1, 2, 3]]`. 
+ + Structs ------- diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 5a2eab56d..88ea7280d 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -147,6 +147,8 @@ "length", "levenshtein", "list_append", + "list_cat", + "list_concat", "list_dims", "list_distinct", "list_element", @@ -162,6 +164,7 @@ "list_prepend", "list_push_back", "list_push_front", + "list_repeat", "list_remove", "list_remove_all", "list_remove_n", @@ -1145,6 +1148,22 @@ def array_distinct(array: Expr) -> Expr: return Expr(f.array_distinct(array.expr)) +def list_cat(*args: Expr) -> Expr: + """Concatenates the input arrays. + + This is an alias for :py:func:`array_concat`, :py:func:`array_cat`. + """ + return array_concat(*args) + + +def list_concat(*args: Expr) -> Expr: + """Concatenates the input arrays. + + This is an alias for :py:func:`array_concat`, :py:func:`array_cat`. + """ + return array_concat(*args) + + def list_distinct(array: Expr) -> Expr: """Returns distinct values from the array after removing duplicates. @@ -1369,6 +1388,14 @@ def array_repeat(element: Expr, count: Expr) -> Expr: return Expr(f.array_repeat(element.expr, count.expr)) +def list_repeat(element: Expr, count: Expr) -> Expr: + """Returns an array containing ``element`` ``count`` times. + + This is an alias for :py:func:`array_repeat`. 
+ """ + return array_repeat(element, count) + + def array_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr: """Replaces the first occurrence of ``from_val`` with ``to_val``.""" return Expr(f.array_replace(array.expr, from_val.expr, to_val.expr)) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index b3a5a0652..c14cfc2dc 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -291,6 +291,14 @@ def py_flatten(arr): lambda col: f.array_cat(col, col), lambda data: [np.concatenate([arr, arr]) for arr in data], ], + [ + lambda col: f.list_cat(col, col), + lambda data: [np.concatenate([arr, arr]) for arr in data], + ], + [ + lambda col: f.list_concat(col, col), + lambda data: [np.concatenate([arr, arr]) for arr in data], + ], [ lambda col: f.array_dims(col), lambda data: [[len(r)] for r in data], @@ -439,6 +447,10 @@ def py_flatten(arr): lambda col: f.array_repeat(col, literal(2)), lambda data: [[arr] * 2 for arr in data], ], + [ + lambda col: f.list_repeat(col, literal(2)), + lambda data: [[arr] * 2 for arr in data], + ], [ lambda col: f.array_replace(col, literal(3.0), literal(4.0)), lambda data: [py_arr_replace(arr, 3.0, 4.0, 1) for arr in data], From 5e32ada2565cebec3df54a1bbf9725f3a434b24d Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 15 Nov 2024 11:23:27 -0500 Subject: [PATCH 075/248] Add foreign table providers (#921) * testing FFI for table provider * Was able to get round trip schema from datafusion -> delta table -> datafusion * Expand file structure * WIP on execution plan * Working through execution plan FFI * Using datafusion-proto for execution plan properties * Adding plan properties parsing from ffi * Standardize naming for FFI structs * Intermediate testing and troubleshooting * Adding record batch stream ffi representation * Mimimum viable product demonstrating foreign table provider * Move ffi module to datafusion core * Modifications need to compile against latest DF * Set DF to 
42.0.0 * Rebasing and pulling in a few changes for DF43.0 * Add wrapper for register table provider * Suppress deprecation warning * Add example for FFI table provider * Add pytest for FFI module to CI * Add license text * Change the name of the FFI table provider test so it doesn't try to run during the first pass of pytest when the module hasn't been built * Build example in build stage to be used during test stage * Combine pytests into one stage * Fix path for unit test * Installing maturin for ffi test in test script * Need to install the wheel for unit test * Add online documentation about using custom table providers * Raise an error if method is not implemented when it is expected --- .github/workflows/build.yml | 4 +- .github/workflows/test.yaml | 9 + Cargo.lock | 627 +++- Cargo.toml | 1 + docs/source/user-guide/io/index.rst | 1 + docs/source/user-guide/io/table_provider.rst | 56 + .../ffi-table-provider/.cargo/config.toml | 12 + examples/ffi-table-provider/Cargo.lock | 3175 +++++++++++++++++ examples/ffi-table-provider/Cargo.toml | 36 + examples/ffi-table-provider/build.rs | 20 + examples/ffi-table-provider/pyproject.toml | 33 + .../python/tests/_test_table_provider.py | 40 + examples/ffi-table-provider/src/lib.rs | 115 + python/datafusion/context.py | 8 + python/tests/test_dataframe.py | 1 + src/context.rs | 29 +- 16 files changed, 4078 insertions(+), 89 deletions(-) create mode 100644 docs/source/user-guide/io/table_provider.rst create mode 100644 examples/ffi-table-provider/.cargo/config.toml create mode 100644 examples/ffi-table-provider/Cargo.lock create mode 100644 examples/ffi-table-provider/Cargo.toml create mode 100644 examples/ffi-table-provider/build.rs create mode 100644 examples/ffi-table-provider/pyproject.toml create mode 100644 examples/ffi-table-provider/python/tests/_test_table_provider.py create mode 100644 examples/ffi-table-provider/src/lib.rs diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 
f52913ce8..084a96192 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -156,7 +156,9 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Build Python package - run: maturin build --release --strip --features substrait + run: | + maturin build --release --strip --features substrait + - name: List Mac wheels run: find target/wheels/ diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index f9383db5f..21faedecd 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -110,6 +110,15 @@ jobs: pip install -e . -vv pytest -v . + - name: FFI unit tests + run: | + source venv/bin/activate + pip install -e . -vv + pip install maturin==1.5.1 + cd examples/ffi-table-provider + maturin develop --release --strip + pytest python/tests/_test_table_provider.py + - name: Cache the generated dataset id: cache-tpch-dataset uses: actions/cache@v4 diff --git a/Cargo.lock b/Cargo.lock index f483a6a2a..7b57b330a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,54 @@ # It is not intended for manual editing. 
version = 3 +[[package]] +name = "abi_stable" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d6512d3eb05ffe5004c59c206de7f99c34951504056ce23fc953842f12c445" +dependencies = [ + "abi_stable_derive", + "abi_stable_shared", + "const_panic", + "core_extensions", + "crossbeam-channel", + "generational-arena", + "libloading", + "lock_api", + "parking_lot", + "paste", + "repr_offset", + "rustc_version", + "serde", + "serde_derive", + "serde_json", +] + +[[package]] +name = "abi_stable_derive" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7178468b407a4ee10e881bc7a328a65e739f0863615cca4429d43916b05e898" +dependencies = [ + "abi_stable_shared", + "as_derive_utils", + "core_extensions", + "proc-macro2", + "quote", + "rustc_version", + "syn 1.0.109", + "typed-arena", +] + +[[package]] +name = "abi_stable_shared" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2b5df7688c123e63f4d4d649cba63f2967ba7f7861b1664fca3f77d3dad2b63" +dependencies = [ + "core_extensions", +] + [[package]] name = "addr2line" version = "0.24.2" @@ -63,9 +111,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" +checksum = "611cc2ae7d2e242c457e4be7f97036b8ad9ca152b499f53faf99b1ed8fc2553f" [[package]] name = "android-tzdata" @@ -84,9 +132,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.91" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c042108f3ed77fd83760a5fd79b53be043192bb3b9dba91d8c574c0ada7850c8" +checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" [[package]] name = "apache-avro" @@ -349,6 +397,18 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = 
"as_derive_utils" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff3c96645900a44cf11941c111bd08a6573b0e2f9f69bc9264b179d8fae753c4" +dependencies = [ + "core_extensions", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "async-compression" version = "0.4.17" @@ -367,6 +427,15 @@ dependencies = [ "zstd-safe 7.2.1", ] +[[package]] +name = "async-ffi" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4de21c0feef7e5a556e51af767c953f0501f7f300ba785cc99c47bdc8081a50" +dependencies = [ + "abi_stable", +] + [[package]] name = "async-recursion" version = "1.1.1" @@ -375,7 +444,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.87", ] [[package]] @@ -386,7 +455,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.87", ] [[package]] @@ -542,9 +611,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.31" +version = "1.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2e7962b54006dcfcc61cb72735f4d89bb97061dd6a7ed882ec6b8ee53714c6f" +checksum = "40545c26d092346d8a8dab71ee48e7685a7a9cba76e634790c215b41a4a7b4cf" dependencies = [ "jobserver", "libc", @@ -637,6 +706,12 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "const_panic" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "013b6c2c3a14d678f38cd23994b02da3a1a1b6a5d1eedddfe63a5a5f11b13a81" + [[package]] name = "constant_time_eq" version = "0.3.1" @@ -668,6 +743,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "core_extensions" +version = "1.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92c71dc07c9721607e7a16108336048ee978c3a8b129294534272e8bac96c0ee" +dependencies = [ + 
"core_extensions_proc_macros", +] + +[[package]] +name = "core_extensions_proc_macros" +version = "1.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69f3b219d28b6e3b4ac87bc1fc522e0803ab22e055da177bff0068c4150c61a6" + [[package]] name = "cpufeatures" version = "0.2.14" @@ -686,6 +776,15 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.20" @@ -917,6 +1016,24 @@ dependencies = [ "paste", ] +[[package]] +name = "datafusion-ffi" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e923c459b53a26d92a8806d1f6a37fdf48bde51507a39eaed6f42a60f2bfd160" +dependencies = [ + "abi_stable", + "arrow", + "async-ffi", + "async-trait", + "datafusion", + "datafusion-proto", + "doc-comment", + "futures", + "log", + "prost", +] + [[package]] name = "datafusion-functions" version = "43.0.0" @@ -1176,6 +1293,7 @@ dependencies = [ "arrow", "async-trait", "datafusion", + "datafusion-ffi", "datafusion-functions-window-common", "datafusion-proto", "datafusion-substrait", @@ -1238,6 +1356,23 @@ dependencies = [ "subtle", ] +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + [[package]] name = "dyn-clone" version = "1.0.17" @@ -1268,9 +1403,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.1.1" +version = "2.2.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" +checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" [[package]] name = "fixedbitset" @@ -1369,7 +1504,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.87", ] [[package]] @@ -1402,6 +1537,15 @@ dependencies = [ "slab", ] +[[package]] +name = "generational-arena" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877e94aff08e743b651baaea359664321055749b398adff8740a7399af7796e7" +dependencies = [ + "cfg-if", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -1477,9 +1621,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.0" +version = "0.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" +checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" [[package]] name = "heck" @@ -1631,14 +1775,143 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", 
+ "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "idna" -version = "0.5.0" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" dependencies = [ - "unicode-bidi", - "unicode-normalization", + "icu_normalizer", + "icu_properties", ] [[package]] @@ -1648,7 +1921,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" dependencies = [ "equivalent", - "hashbrown 0.15.0", + "hashbrown 0.15.1", ] [[package]] @@ -1786,9 +2059,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.161" +version = "0.2.162" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" +checksum = "18d287de67fe55fd7e1581fe933d965a5a9477b38e949cfa9f8574ef01506398" [[package]] name = "libflate" @@ -1814,6 +2087,16 @@ dependencies = [ "rle-decode-fast", ] +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + [[package]] name = "libm" version = "0.2.11" @@ -1836,6 +2119,12 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +[[package]] +name = "litemap" +version = 
"0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" + [[package]] name = "lock_api" version = "0.4.12" @@ -2289,7 +2578,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64d1ec885c64d0457d564db4ec299b2dae3f9c02808b8ad9c3a089c591b18033" dependencies = [ "proc-macro2", - "syn", + "syn 2.0.87", ] [[package]] @@ -2328,7 +2617,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn", + "syn 2.0.87", "tempfile", ] @@ -2342,7 +2631,7 @@ dependencies = [ "itertools", "proc-macro2", "quote", - "syn", + "syn 2.0.87", ] [[package]] @@ -2365,9 +2654,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.22.5" +version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d922163ba1f79c04bc49073ba7b32fd5a8d3b76a87c955921234b8e77333c51" +checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884" dependencies = [ "cfg-if", "indoc", @@ -2383,9 +2672,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.22.5" +version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc38c5feeb496c8321091edf3d63e9a6829eab4b863b4a6a65f26f3e9cc6b179" +checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38" dependencies = [ "once_cell", "target-lexicon", @@ -2393,9 +2682,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.22.5" +version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94845622d88ae274d2729fcefc850e63d7a3ddff5e3ce11bd88486db9f1d357d" +checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636" dependencies = [ "libc", "pyo3-build-config", @@ -2403,34 +2692,34 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.22.5" +version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e655aad15e09b94ffdb3ce3d217acf652e26bbc37697ef012f5e5e348c716e5e" +checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn", + "syn 2.0.87", ] [[package]] name = "pyo3-macros-backend" -version = "0.22.5" +version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae1e3f09eecd94618f60a455a23def79f79eba4dc561a97324bf9ac8c6df30ce" +checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe" dependencies = [ "heck 0.5.0", "proc-macro2", "pyo3-build-config", "quote", - "syn", + "syn 2.0.87", ] [[package]] name = "quad-rand" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b76f1009795ca44bb5aaae8fd3f18953e209259c33d9b059b1f53d58ab7511db" +checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" [[package]] name = "quick-xml" @@ -2479,9 +2768,9 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.6" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e346e016eacfff12233c243718197ca12f148c84e1e84268a896699b41c71780" +checksum = "7d5a626c6807713b15cac82a6acaccd6043c9a5408c24baae07611fec3f243da" dependencies = [ "cfg_aliases", "libc", @@ -2584,6 +2873,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "repr_offset" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1070755bd29dffc19d0971cab794e607839ba2ef4b69a9e6fbc8733c1b72ea" +dependencies = [ + "tstr", +] + [[package]] name = "reqwest" version = "0.12.9" @@ -2673,9 +2971,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.38" +version = "0.38.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa260229e6538e52293eeb577aabd09945a09d6d9cc0fc550ed7529056c2e32a" +checksum = 
"375116bee2be9ed569afe2154ea6a99dfdffd257f533f187498c2a8f5feaf4ee" dependencies = [ "bitflags 2.6.0", "errno", @@ -2788,7 +3086,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn", + "syn 2.0.87", ] [[package]] @@ -2812,9 +3110,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.12.0" +version = "2.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea4a292869320c0272d7bc55a5a6aafaff59b4f63404a003887b679a2e05b4b6" +checksum = "fa39c7303dc58b5543c94d22c1766b0d31f2ee58306363ea622b10bbc075eaa2" dependencies = [ "core-foundation-sys", "libc", @@ -2852,7 +3150,7 @@ checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.87", ] [[package]] @@ -2863,7 +3161,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.87", ] [[package]] @@ -2887,7 +3185,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn", + "syn 2.0.87", ] [[package]] @@ -2971,7 +3269,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn", + "syn 2.0.87", ] [[package]] @@ -3014,9 +3312,15 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.87", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "static_assertions" version = "1.1.0" @@ -3048,7 +3352,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn", + "syn 2.0.87", ] [[package]] @@ -3061,7 +3365,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn", + "syn 2.0.87", ] [[package]] @@ -3085,7 +3389,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn", + "syn 2.0.87", "typify", 
"walkdir", ] @@ -3098,9 +3402,20 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.85" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5023162dfcd14ef8f32034d8bcd4cc5ddc61ef7a247c024a33e24e1f24d21b56" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" dependencies = [ "proc-macro2", "quote", @@ -3116,6 +3431,17 @@ dependencies = [ "futures-core", ] +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "target-lexicon" version = "0.12.16" @@ -3124,9 +3450,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.13.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0f2c9fc62d0beef6951ccffd757e241266a2c833136efbe35af6cd2567dca5b" +checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" dependencies = [ "cfg-if", "fastrand", @@ -3137,22 +3463,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.65" +version = "1.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d11abd9594d9b38965ef50805c5e469ca9cc6f197f883f717e0269a3057b3d5" +checksum = "02dd99dc800bbb97186339685293e1cc5d9df1f8fae2d0aecd9ff1c77efea892" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.65" +version = "1.0.68" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae71770322cbd277e69d762a16c444af02aa0575ac0d174f0b9562d3b37f8602" +checksum = "a7c61ec9a6f64d2793d8a45faba21efbe3ced62a886d44c36a009b2b519b4c7e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.87", ] [[package]] @@ -3175,6 +3501,16 @@ dependencies = [ "crunchy", ] +[[package]] +name = "tinystr" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tinyvec" version = "1.8.0" @@ -3214,7 +3550,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.87", ] [[package]] @@ -3266,7 +3602,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.87", ] [[package]] @@ -3284,6 +3620,21 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "tstr" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f8e0294f14baae476d0dd0a2d780b2e24d66e349a9de876f5126777a37bdba7" +dependencies = [ + "tstr_proc_macros", +] + +[[package]] +name = "tstr_proc_macros" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78122066b0cb818b8afd08f7ed22f7fdbc3e90815035726f0840d0d26c0747a" + [[package]] name = "twox-hash" version = "1.6.3" @@ -3294,6 +3645,12 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "typed-arena" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" + [[package]] name = "typed-builder" version = "0.16.2" @@ 
-3311,7 +3668,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.87", ] [[package]] @@ -3345,7 +3702,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn", + "syn 2.0.87", "thiserror", "unicode-ident", ] @@ -3363,31 +3720,16 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn", + "syn 2.0.87", "typify-impl", ] -[[package]] -name = "unicode-bidi" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" - [[package]] name = "unicode-ident" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" -[[package]] -name = "unicode-normalization" -version = "0.1.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" -dependencies = [ - "tinyvec", -] - [[package]] name = "unicode-segmentation" version = "1.12.0" @@ -3420,15 +3762,27 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.2" +version = "2.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" +checksum = "8d157f1b96d14500ffdc1f10ba712e780825526c03d9a49b4d0324b0d9113ada" dependencies = [ "form_urlencoded", "idna", "percent-encoding", ] +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + 
[[package]] name = "uuid" version = "1.11.0" @@ -3492,7 +3846,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 2.0.87", "wasm-bindgen-shared", ] @@ -3526,7 +3880,7 @@ checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.87", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3560,6 +3914,22 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + [[package]] name = "winapi-util" version = "0.1.9" @@ -3569,6 +3939,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-core" version = "0.52.0" @@ -3690,6 +4066,18 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + [[package]] name = "xz2" version = "0.1.7" @@ -3699,6 +4087,30 @@ dependencies = [ 
"lzma-sys", ] +[[package]] +name = "yoke" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.7.35" @@ -3717,7 +4129,28 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.87", +] + +[[package]] +name = "zerofrom" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", + "synstructure", ] [[package]] @@ -3726,6 +4159,28 @@ version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" 
+dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "zstd" version = "0.12.4" diff --git a/Cargo.toml b/Cargo.toml index d86948b3a..02707b957 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,7 @@ arrow = { version = "53", features = ["pyarrow"] } datafusion = { version = "43.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } datafusion-substrait = { version = "43.0.0", optional = true } datafusion-proto = { version = "43.0.0" } +datafusion-ffi = { version = "43.0.0" } datafusion-functions-window-common = { version = "43.0.0" } prost = "0.13" # keep in line with `datafusion-substrait` uuid = { version = "1.11", features = ["v4"] } diff --git a/docs/source/user-guide/io/index.rst b/docs/source/user-guide/io/index.rst index 05411327e..b885cfeda 100644 --- a/docs/source/user-guide/io/index.rst +++ b/docs/source/user-guide/io/index.rst @@ -26,3 +26,4 @@ IO csv json parquet + table_provider diff --git a/docs/source/user-guide/io/table_provider.rst b/docs/source/user-guide/io/table_provider.rst new file mode 100644 index 000000000..2ff9ae46f --- /dev/null +++ b/docs/source/user-guide/io/table_provider.rst @@ -0,0 +1,56 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. 
+ +Custom Table Provider +===================== + +If you have a custom data source that you want to integrate with DataFusion, you can do so by +implementing the `TableProvider `_ +interface in Rust and then exposing it in Python. To do so, +you must use DataFusion 43.0.0 or later and expose a `FFI_TableProvider `_ +via `PyCapsule `_. + +A complete example can be found in the `examples folder `_. + +.. code-block:: rust + + #[pymethods] + impl MyTableProvider { + + fn __datafusion_table_provider__<'py>( + &self, + py: Python<'py>, + ) -> PyResult<Bound<'py, PyCapsule>> { + let name = CString::new("datafusion_table_provider").unwrap(); + + let provider = Arc::new(self.clone()) + .map_err(|e| PyRuntimeError::new_err(e.to_string()))?; + let provider = FFI_TableProvider::new(Arc::new(provider), false); + + PyCapsule::new_bound(py, provider, Some(name.clone())) + } + } + +Once you have this library available, in Python you can register your table provider +to the ``SessionContext``. + +.. code-block:: python + + provider = MyTableProvider() + ctx.register_table_provider("my_table", provider) + + ctx.table("my_table").show() diff --git a/examples/ffi-table-provider/.cargo/config.toml b/examples/ffi-table-provider/.cargo/config.toml new file mode 100644 index 000000000..91a099a61 --- /dev/null +++ b/examples/ffi-table-provider/.cargo/config.toml @@ -0,0 +1,12 @@ +[target.x86_64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] + +[target.aarch64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] + diff --git a/examples/ffi-table-provider/Cargo.lock b/examples/ffi-table-provider/Cargo.lock new file mode 100644 index 000000000..3b57cac75 --- /dev/null +++ b/examples/ffi-table-provider/Cargo.lock @@ -0,0 +1,3175 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "abi_stable" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d6512d3eb05ffe5004c59c206de7f99c34951504056ce23fc953842f12c445" +dependencies = [ + "abi_stable_derive", + "abi_stable_shared", + "const_panic", + "core_extensions", + "crossbeam-channel", + "generational-arena", + "libloading", + "lock_api", + "parking_lot", + "paste", + "repr_offset", + "rustc_version", + "serde", + "serde_derive", + "serde_json", +] + +[[package]] +name = "abi_stable_derive" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7178468b407a4ee10e881bc7a328a65e739f0863615cca4429d43916b05e898" +dependencies = [ + "abi_stable_shared", + "as_derive_utils", + "core_extensions", + "proc-macro2", + "quote", + "rustc_version", + "syn 1.0.109", + "typed-arena", +] + +[[package]] +name = "abi_stable_shared" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2b5df7688c123e63f4d4d649cba63f2967ba7f7861b1664fca3f77d3dad2b63" +dependencies = [ + "core_extensions", +] + +[[package]] +name = "addr2line" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "const-random", + "getrandom", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + +[[package]] +name = "allocator-api2" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9" + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anyhow" +version = "1.0.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" + +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "arrow" +version = "53.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4caf25cdc4a985f91df42ed9e9308e1adbcd341a31a72605c697033fcef163e3" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "53.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91f2dfd1a7ec0aca967dfaa616096aec49779adc8eccec005e2f5e4111b1192a" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "num", +] + +[[package]] +name = "arrow-array" +version = "53.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d39387ca628be747394890a6e47f138ceac1aa912eab64f02519fed24b637af8" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.14.5", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "53.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e51e05228852ffe3eb391ce7178a0f97d2cf80cc6ef91d3c4a6b3cb688049ec" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "53.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d09aea56ec9fa267f3f3f6cdab67d8a9974cbba90b3aa38c8fe9d0bb071bd8c1" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "atoi", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "53.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c07b5232be87d115fde73e32f2ca7f1b353bff1b44ac422d3c6fc6ae38f11f0d" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "lazy_static", + 
"lexical-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "53.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b98ae0af50890b494cebd7d6b04b35e896205c1d1df7b29a6272c5d0d0249ef5" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-ipc" +version = "53.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ed91bdeaff5a1c00d28d8f73466bcb64d32bbd7093b5a30156b4b9f4dba3eee" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "flatbuffers", + "lz4_flex", +] + +[[package]] +name = "arrow-json" +version = "53.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0471f51260a5309307e5d409c9dc70aede1cd9cf1d4ff0f0a1e8e1a2dd0e0d3c" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap", + "lexical-core", + "num", + "serde", + "serde_json", +] + +[[package]] +name = "arrow-ord" +version = "53.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2883d7035e0b600fb4c30ce1e50e66e53d8656aa729f2bfa4b51d359cf3ded52" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "half", + "num", +] + +[[package]] +name = "arrow-row" +version = "53.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "552907e8e587a6fde4f8843fd7a27a576a260f65dab6c065741ea79f633fc5be" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", +] + +[[package]] +name = "arrow-schema" +version = "53.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "539ada65246b949bd99ffa0881a9a15a4a529448af1a07a9838dd78617dafab1" +dependencies = [ + "bitflags 2.6.0", +] + +[[package]] +name = "arrow-select" +version = "53.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6259e566b752da6dceab91766ed8b2e67bf6270eb9ad8a6e07a33c1bede2b125" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "arrow-string" +version = "53.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3179ccbd18ebf04277a095ba7321b93fd1f774f18816bd5f6b3ce2f594edb6c" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num", + "regex", + "regex-syntax", +] + +[[package]] +name = "as_derive_utils" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff3c96645900a44cf11941c111bd08a6573b0e2f9f69bc9264b179d8fae753c4" +dependencies = [ + "core_extensions", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "async-compression" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cb8f1d480b0ea3783ab015936d2a55c87e219676f0c0b7dec61494043f21857" +dependencies = [ + "bzip2", + "flate2", + "futures-core", + "futures-io", + "memchr", + "pin-project-lite", + "tokio", + "xz2", + "zstd", + "zstd-safe", +] + +[[package]] +name = "async-ffi" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4de21c0feef7e5a556e51af767c953f0501f7f300ba785cc99c47bdc8081a50" +dependencies = [ + "abi_stable", +] + +[[package]] +name = "async-trait" +version = "0.1.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + 
+[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "backtrace" +version = "0.3.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-targets", +] + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" + +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake3" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82033247fd8e890df8f740e407ad4d038debb9eb1f40533fffb32e7d17dc6f7" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "brotli" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "cc97b8f16f944bba54f0433f07e30be199b6dc2bd25937444bbad560bcea29bd" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "4.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a45bd2e4095a8b518033b128020dd4a55aab1c0a381ba4404a472630f4bc362" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" + +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "cc" +version = "1.1.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40545c26d092346d8a8dab71ee48e7685a7a9cba76e634790c215b41a4a7b4cf" +dependencies = [ + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = 
"chrono" +version = "0.4.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "num-traits", + "windows-targets", +] + +[[package]] +name = "chrono-tz" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd6dd8046d00723a59a2f8c5f295c515b9bb9a331ee4f8f3d4dd49e428acd3b6" +dependencies = [ + "chrono", + "chrono-tz-build", + "phf", +] + +[[package]] +name = "chrono-tz-build" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e94fea34d77a245229e7746bd2beb786cd2a896f306ff491fb8cecb3074b10a7" +dependencies = [ + "parse-zoneinfo", + "phf_codegen", +] + +[[package]] +name = "comfy-table" +version = "7.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" +dependencies = [ + "strum", + "strum_macros", + "unicode-width", +] + +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom", + "once_cell", + "tiny-keccak", +] + +[[package]] +name = "const_panic" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "013b6c2c3a14d678f38cd23994b02da3a1a1b6a5d1eedddfe63a5a5f11b13a81" + +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "core_extensions" +version = "1.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92c71dc07c9721607e7a16108336048ee978c3a8b129294534272e8bac96c0ee" +dependencies = [ + "core_extensions_proc_macros", +] + +[[package]] +name = "core_extensions_proc_macros" +version = "1.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69f3b219d28b6e3b4ac87bc1fc522e0803ab22e055da177bff0068c4150c61a6" + +[[package]] +name = "cpufeatures" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csv" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "datafusion" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbba0799cf6913b456ed07a94f0f3b6e12c62a5d88b10809e2284a0f2b915c05" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-ipc", + "arrow-schema", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "dashmap", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-nested", + "datafusion-functions-window", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-sql", + "flate2", + "futures", + "glob", + "half", + "hashbrown 0.14.5", + "indexmap", + "itertools", + "log", + "num_cpus", + "object_store", + "parking_lot", + "parquet", + "paste", + "pin-project-lite", + "rand", + "sqlparser", + "tempfile", + 
"tokio", + "tokio-util", + "url", + "uuid", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-catalog" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7493c5c2d40eec435b13d92e5703554f4efc7059451fcb8d3a79580ff0e45560" +dependencies = [ + "arrow-schema", + "async-trait", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", +] + +[[package]] +name = "datafusion-common" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24953049ebbd6f8964f91f60aa3514e121b5e81e068e33b60e77815ab369b25c" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema", + "chrono", + "half", + "hashbrown 0.14.5", + "indexmap", + "instant", + "libc", + "num_cpus", + "object_store", + "parquet", + "paste", + "sqlparser", + "tokio", +] + +[[package]] +name = "datafusion-common-runtime" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f06df4ef76872e11c924d3c814fd2a8dd09905ed2e2195f71c857d78abd19685" +dependencies = [ + "log", + "tokio", +] + +[[package]] +name = "datafusion-execution" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bbdcb628d690f3ce5fea7de81642b514486d58ff9779a51f180a69a4eadb361" +dependencies = [ + "arrow", + "chrono", + "dashmap", + "datafusion-common", + "datafusion-expr", + "futures", + "hashbrown 0.14.5", + "log", + "object_store", + "parking_lot", + "rand", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-expr" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8036495980e3131f706b7d33ab00b4492d73dc714e3cb74d11b50f9602a73246" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-buffer", + "chrono", + "datafusion-common", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + 
"datafusion-functions-window-common", + "datafusion-physical-expr-common", + "indexmap", + "paste", + "serde_json", + "sqlparser", + "strum", + "strum_macros", +] + +[[package]] +name = "datafusion-expr-common" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4da0f3cb4669f9523b403d6b5a0ec85023e0ab3bf0183afd1517475b3e64fdd2" +dependencies = [ + "arrow", + "datafusion-common", + "itertools", + "paste", +] + +[[package]] +name = "datafusion-ffi" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e923c459b53a26d92a8806d1f6a37fdf48bde51507a39eaed6f42a60f2bfd160" +dependencies = [ + "abi_stable", + "arrow", + "async-ffi", + "async-trait", + "datafusion", + "datafusion-proto", + "doc-comment", + "futures", + "log", + "prost", +] + +[[package]] +name = "datafusion-functions" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f52c4012648b34853e40a2c6bcaa8772f837831019b68aca384fb38436dba162" +dependencies = [ + "arrow", + "arrow-buffer", + "base64", + "blake2", + "blake3", + "chrono", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "hashbrown 0.14.5", + "hex", + "itertools", + "log", + "md-5", + "rand", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5b8bb624597ba28ed7446df4a9bd7c7a7bde7c578b6b527da3f47371d5f6741" +dependencies = [ + "ahash", + "arrow", + "arrow-schema", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "half", + "indexmap", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-aggregate-common" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "6fb06208fc470bc8cf1ce2d9a1159d42db591f2c7264a8c1776b53ad8f675143" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "datafusion-physical-expr-common", + "rand", +] + +[[package]] +name = "datafusion-functions-nested" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fca25bbb87323716d05e54114666e942172ccca23c5a507e9c7851db6e965317" +dependencies = [ + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-ord", + "arrow-schema", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-physical-expr-common", + "itertools", + "log", + "paste", + "rand", +] + +[[package]] +name = "datafusion-functions-window" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ae23356c634e54c59f7c51acb7a5b9f6240ffb2cf997049a1a24a8a88598dbe" +dependencies = [ + "datafusion-common", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4b3d6ff7794acea026de36007077a06b18b89e4f9c3fea7f2215f9f7dd9059b" +dependencies = [ + "datafusion-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-optimizer" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bec6241eb80c595fa0e1a8a6b69686b5cf3bd5fdacb8319582a0943b0bd788aa" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", + "hashbrown 0.14.5", + "indexmap", + "itertools", + "log", + "paste", + "regex-syntax", +] + +[[package]] +name = "datafusion-physical-expr" +version = "43.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3370357b8fc75ec38577700644e5d1b0bc78f38babab99c0b8bd26bafb3e4335" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-ord", + "arrow-schema", + "arrow-string", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", + "half", + "hashbrown 0.14.5", + "indexmap", + "itertools", + "log", + "paste", + "petgraph", +] + +[[package]] +name = "datafusion-physical-expr-common" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8b7734d94bf2fa6f6e570935b0ddddd8421179ce200065be97874e13d46a47b" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "hashbrown 0.14.5", + "rand", +] + +[[package]] +name = "datafusion-physical-optimizer" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eee8c479522df21d7b395640dff88c5ed05361852dce6544d7c98e9dbcebffe" +dependencies = [ + "arrow", + "arrow-schema", + "datafusion-common", + "datafusion-execution", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-plan", + "itertools", +] + +[[package]] +name = "datafusion-physical-plan" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17e1fc2e2c239d14e8556f2622b19a726bf6bc6962cc00c71fc52626274bee24" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-ord", + "arrow-schema", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "futures", + "half", + "hashbrown 0.14.5", + "indexmap", + "itertools", + "log", + "once_cell", + 
"parking_lot", + "pin-project-lite", + "rand", + "tokio", +] + +[[package]] +name = "datafusion-proto" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f730f7fc5a20134d4e5ecdf7bbf392002ac58163d58423ea28a702dc077b06e1" +dependencies = [ + "arrow", + "chrono", + "datafusion", + "datafusion-common", + "datafusion-expr", + "datafusion-proto-common", + "object_store", + "prost", +] + +[[package]] +name = "datafusion-proto-common" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12c225fe49e4f943e35446b263613ada7a9e9f8d647544e6b07037b9803567df" +dependencies = [ + "arrow", + "chrono", + "datafusion-common", + "object_store", + "prost", +] + +[[package]] +name = "datafusion-sql" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e3a4ed41dbee20a5d947a59ca035c225d67dc9cbe869c10f66dcdf25e7ce51" +dependencies = [ + "arrow", + "arrow-array", + "arrow-schema", + "datafusion-common", + "datafusion-expr", + "indexmap", + "log", + "regex", + "sqlparser", + "strum", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "fastrand" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" + +[[package]] +name = "ffi-table-provider" +version = "0.1.0" +dependencies = [ + "arrow", + "arrow-array", + "arrow-schema", + "datafusion", + "datafusion-ffi", + "pyo3", + "pyo3-build-config", +] + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "flatbuffers" +version = "24.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8add37afff2d4ffa83bc748a70b4b1370984f6980768554182424ef71447c35f" +dependencies = [ + "bitflags 1.3.2", + "rustc_version", +] + +[[package]] +name = "flate2" +version = "1.0.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futures" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "generational-arena" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877e94aff08e743b651baaea359664321055749b398adff8740a7399af7796e7" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "gimli" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" + +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", +] + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", +] + +[[package]] +name = "hashbrown" +version = "0.15.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "iana-time-zone" +version = "0.1.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" 
+dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "idna" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "indexmap" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +dependencies = [ + "equivalent", + "hashbrown 0.15.1", +] + +[[package]] +name = "indoc" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" + +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "itertools" 
+version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" + +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.72" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "lexical-core" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0431c65b318a590c1de6b8fd6e72798c92291d27762d94c9e6c37ed7a73d8458" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb17a4bdb9b418051aa59d41d65b1c9be5affab314a872e5ad7f06231fb3b4e0" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5df98f4a4ab53bf8b175b363a34c7af608fe31f93cc1fb1bf07130622ca4ef61" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = 
"lexical-util" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85314db53332e5c192b6bca611fb10c114a80d1b831ddac0af1e9be1b9232ca0" +dependencies = [ + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e7c3ad4e37db81c1cbe7cf34610340adc09c322871972f74877a712abc6c809" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb89e9f6958b83258afa3deed90b5de9ef68eef090ad5086c791cd2345610162" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "libc" +version = "0.2.162" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18d287de67fe55fd7e1581fe933d965a5a9477b38e949cfa9f8574ef01506398" + +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + +[[package]] +name = "libm" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" + +[[package]] +name = "linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + +[[package]] +name = "litemap" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" + +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "lz4_flex" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" +dependencies = [ + "twox-hash", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + 
"num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "object" +version = "0.36.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" +dependencies = [ + 
"memchr", +] + +[[package]] +name = "object_store" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eb4c22c6154a1e759d7099f9ffad7cc5ef8245f9efbab4a41b92623079c82f3" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "humantime", + "itertools", + "parking_lot", + "percent-encoding", + "snafu", + "tokio", + "tracing", + "url", + "walkdir", +] + +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "parquet" +version = "53.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dea02606ba6f5e856561d8d507dba8bac060aefca2a6c0f1aa1d361fed91ff3e" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown 0.14.5", + "lz4_flex", + "num", + "num-bigint", + "object_store", + "paste", + "seq-macro", + "snap", + "thrift", + "tokio", + "twox-hash", 
+ "zstd", + "zstd-sys", +] + +[[package]] +name = "parse-zoneinfo" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f2a05b18d44e2957b88f96ba460715e295bc1d7510468a2f3d3b44535d26c24" +dependencies = [ + "regex", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "petgraph" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +dependencies = [ + "fixedbitset", + "indexmap", +] + +[[package]] +name = "phf" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.15" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" + +[[package]] +name = "portable-atomic" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" + +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "prost" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-derive" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "pyo3" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884" +dependencies = [ + "cfg-if", + "indoc", + 
"libc", + "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + 
+[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "redox_syscall" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" +dependencies = [ + "bitflags 2.6.0", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "repr_offset" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1070755bd29dffc19d0971cab794e607839ba2ef4b69a9e6fbc8733c1b72ea" +dependencies = [ + "tstr", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "0.38.40" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "99e4ea3e1cdc4b559b8e5650f9c8e5998e3e5c1343b4eaf034565f32318d63c0" +dependencies = [ + "bitflags 2.6.0", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustversion" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "semver" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" + +[[package]] +name = "seq-macro" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" + +[[package]] +name = "serde" +version = "1.0.214" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.214" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = 
"serde_json" +version = "1.0.132" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "snafu" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "223891c85e2a29c3fe8fb900c1fae5e69c2e42415e3177752e8718475efa5019" +dependencies = [ + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + +[[package]] +name = "sqlparser" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fe11944a61da0da3f592e19a45ebe5ab92dc14a779907ff1f08fbb797bfefc7" +dependencies = [ + "log", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.87", +] + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + 
+[[package]] +name = "syn" +version = "2.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "tempfile" +version = "3.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" +dependencies = [ + "cfg-if", + "fastrand", + "once_cell", + "rustix", + "windows-sys 0.59.0", +] + +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinystr" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tokio" +version = "1.41.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33" +dependencies = 
[ + "backtrace", + "bytes", + "pin-project-lite", + "tokio-macros", +] + +[[package]] +name = "tokio-macros" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "tokio-util" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", +] + +[[package]] +name = "tstr" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f8e0294f14baae476d0dd0a2d780b2e24d66e349a9de876f5126777a37bdba7" +dependencies = [ + "tstr_proc_macros", +] + +[[package]] +name = "tstr_proc_macros" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78122066b0cb818b8afd08f7ed22f7fdbc3e90815035726f0840d0d26c0747a" + +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "static_assertions", +] + +[[package]] +name = "typed-arena" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" + +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" + +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + +[[package]] +name = "unindent" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" + +[[package]] +name = "url" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d157f1b96d14500ffdc1f10ba712e780825526c03d9a49b4d0324b0d9113ada" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "uuid" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" +dependencies = [ + "getrandom", +] + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" +dependencies = [ + "cfg-if", + "once_cell", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.87", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.95" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" + +[[package]] +name = "web-sys" +version = "0.3.72" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + +[[package]] +name = "yoke" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", + "synstructure", +] + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "zerofrom" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", + "synstructure", +] + +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "zstd" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.13+zstd.1.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" +dependencies = [ + "cc", + "pkg-config", +] diff 
--git a/examples/ffi-table-provider/Cargo.toml b/examples/ffi-table-provider/Cargo.toml new file mode 100644 index 000000000..4e54eaf03 --- /dev/null +++ b/examples/ffi-table-provider/Cargo.toml @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "ffi-table-provider" +version = "0.1.0" +edition = "2021" + +[dependencies] +datafusion = { version = "43.0.0" } +datafusion-ffi = { version = "43.0.0" } +pyo3 = { version = "0.22.6", features = ["extension-module", "abi3", "abi3-py38"] } +arrow = { version = "53.2.0" } +arrow-array = { version = "53.2.0" } +arrow-schema = { version = "53.2.0" } + +[build-dependencies] +pyo3-build-config = "0.22.6" + +[lib] +name = "ffi_table_provider" +crate-type = ["cdylib", "rlib"] diff --git a/examples/ffi-table-provider/build.rs b/examples/ffi-table-provider/build.rs new file mode 100644 index 000000000..4878d8b0e --- /dev/null +++ b/examples/ffi-table-provider/build.rs @@ -0,0 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +fn main() { + pyo3_build_config::add_extension_module_link_args(); +} diff --git a/examples/ffi-table-provider/pyproject.toml b/examples/ffi-table-provider/pyproject.toml new file mode 100644 index 000000000..116efae9c --- /dev/null +++ b/examples/ffi-table-provider/pyproject.toml @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[build-system] +requires = ["maturin>=1.6,<2.0"] +build-backend = "maturin" + +[project] +name = "ffi_table_provider" +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dynamic = ["version"] + +[tool.maturin] +features = ["pyo3/extension-module"] diff --git a/examples/ffi-table-provider/python/tests/_test_table_provider.py b/examples/ffi-table-provider/python/tests/_test_table_provider.py new file mode 100644 index 000000000..56c05e4fa --- /dev/null +++ b/examples/ffi-table-provider/python/tests/_test_table_provider.py @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from datafusion import SessionContext +from ffi_table_provider import MyTableProvider +import pyarrow as pa + + +def test_table_loading(): + ctx = SessionContext() + table = MyTableProvider(3, 2, 4) + ctx.register_table_provider("t", table) + result = ctx.table("t").collect() + + assert len(result) == 4 + assert result[0].num_columns == 3 + + result = [r.column(0) for r in result] + expected = [ + pa.array([0, 1], type=pa.int32()), + pa.array([2, 3, 4], type=pa.int32()), + pa.array([4, 5, 6, 7], type=pa.int32()), + pa.array([6, 7, 8, 9, 10], type=pa.int32()), + ] + + assert result == expected diff --git a/examples/ffi-table-provider/src/lib.rs b/examples/ffi-table-provider/src/lib.rs new file mode 100644 index 000000000..473244d88 --- /dev/null +++ b/examples/ffi-table-provider/src/lib.rs @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::{ffi::CString, sync::Arc}; + +use arrow_array::ArrayRef; +use datafusion::{ + arrow::{ + array::RecordBatch, + datatypes::{DataType, Field, Schema}, + }, + datasource::MemTable, + error::{DataFusionError, Result}, +}; +use datafusion_ffi::table_provider::FFI_TableProvider; +use pyo3::{exceptions::PyRuntimeError, prelude::*, types::PyCapsule}; + +/// In order to provide a test that demonstrates different sized record batches, +/// the first batch will have num_rows, the second batch num_rows+1, and so on. +#[pyclass(name = "MyTableProvider", module = "ffi_table_provider", subclass)] +#[derive(Clone)] +struct MyTableProvider { + num_cols: usize, + num_rows: usize, + num_batches: usize, +} + +fn create_record_batch( + schema: &Arc, + num_cols: usize, + start_value: i32, + num_values: usize, +) -> Result { + let end_value = start_value + num_values as i32; + let row_values: Vec = (start_value..end_value).collect(); + + let columns: Vec<_> = (0..num_cols) + .map(|_| { + std::sync::Arc::new(arrow::array::Int32Array::from(row_values.clone())) as ArrayRef + }) + .collect(); + + RecordBatch::try_new(Arc::clone(schema), columns).map_err(DataFusionError::from) +} + +impl MyTableProvider { + fn create_table(&self) -> Result { + let fields: Vec<_> = (0..self.num_cols) + .map(|idx| (b'A' + idx as u8) as char) + .map(|col_name| Field::new(col_name, DataType::Int32, true)) + .collect(); + + let schema = Arc::new(Schema::new(fields)); + + let batches: Result> = (0..self.num_batches) + .map(|batch_idx| { + let start_value = batch_idx * self.num_rows; + create_record_batch( + &schema, + self.num_cols, + start_value as i32, + self.num_rows + batch_idx, + ) + }) + .collect(); + + MemTable::try_new(schema, vec![batches?]) + } +} + +#[pymethods] +impl MyTableProvider { + #[new] + fn new(num_cols: usize, num_rows: usize, num_batches: usize) -> Self { + Self { + num_cols, + num_rows, + num_batches, + } + } + + fn __datafusion_table_provider__<'py>( + &self, + py: Python<'py>, + 
) -> PyResult> { + let name = CString::new("datafusion_table_provider").unwrap(); + + let provider = self + .create_table() + .map_err(|e| PyRuntimeError::new_err(e.to_string()))?; + let provider = FFI_TableProvider::new(Arc::new(provider), false); + + PyCapsule::new_bound(py, provider, Some(name.clone())) + } +} + +#[pymodule] +fn ffi_table_provider(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::()?; + Ok(()) +} diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 5221c866c..a07b5d175 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -685,6 +685,14 @@ def deregister_table(self, name: str) -> None: """Remove a table from the session.""" self.ctx.deregister_table(name) + def register_table_provider(self, name: str, provider: Any) -> None: + """Register a table provider. + + This table provider must have a method called ``__datafusion_table_provider__`` + which returns a PyCapsule that exposes a ``FFI_TableProvider``. + """ + self.ctx.register_table_provider(name, provider) + def register_record_batches( self, name: str, partitions: list[list[pyarrow.RecordBatch]] ) -> None: diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 330475302..b82f95e35 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -306,6 +306,7 @@ def test_unnest_without_nulls(nested_df): assert result.column(1) == pa.array([7, 8, 8, 9, 9, 9]) +@pytest.mark.filterwarnings("ignore:`join_keys`:DeprecationWarning") def test_join(): ctx = SessionContext() diff --git a/src/context.rs b/src/context.rs index c2a263fa7..8675e97df 100644 --- a/src/context.rs +++ b/src/context.rs @@ -28,7 +28,7 @@ use object_store::ObjectStore; use url::Url; use uuid::Uuid; -use pyo3::exceptions::{PyKeyError, PyTypeError, PyValueError}; +use pyo3::exceptions::{PyKeyError, PyNotImplementedError, PyTypeError, PyValueError}; use pyo3::prelude::*; use crate::catalog::{PyCatalog, PyTable}; @@ -67,7 +67,8 
@@ use datafusion::physical_plan::SendableRecordBatchStream; use datafusion::prelude::{ AvroReadOptions, CsvReadOptions, DataFrame, NdJsonReadOptions, ParquetReadOptions, }; -use pyo3::types::{PyDict, PyList, PyTuple}; +use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider}; +use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple}; use tokio::task::JoinHandle; /// Configuration options for a SessionContext @@ -566,6 +567,30 @@ impl PySessionContext { Ok(()) } + /// Register a table provider object that implements `__datafusion_table_provider__` (returning a PyCapsule) + pub fn register_table_provider( + &mut self, + name: &str, + provider: Bound<'_, PyAny>, + ) -> PyResult<()> { + if provider.hasattr("__datafusion_table_provider__")? { + let capsule = provider.getattr("__datafusion_table_provider__")?.call0()?; + let capsule = capsule.downcast::()?; + // validate_pycapsule(capsule, "arrow_array_stream")?; + + let provider = unsafe { capsule.reference::() }; + let provider: ForeignTableProvider = provider.into(); + + let _ = self.ctx.register_table(name, Arc::new(provider))?; + + Ok(()) + } else { + Err(PyNotImplementedError::new_err( + "__datafusion_table_provider__ does not exist on Table Provider object.", + )) + } + } + pub fn register_record_batches( &mut self, name: &str, From 92b093c767a8ec76918db6abeae35f85ee19fa60 Mon Sep 17 00:00:00 2001 From: kosiew Date: Sat, 23 Nov 2024 23:21:24 +0800 Subject: [PATCH 076/248] Add make_list and tests for make_list, make_array (#949) --- python/datafusion/functions.py | 9 +++++++++ python/tests/test_functions.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 88ea7280d..6ad4c50c2 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -184,6 +184,7 @@ "lpad", "ltrim", "make_array", + "make_list", "make_date", "max", "md5", @@ -1044,6 +1045,14 @@ def make_array(*args: Expr) -> Expr: return Expr(f.make_array(args)) +def
make_list(*args: Expr) -> Expr: + """Returns an array using the specified input expressions. + + This is an alias for :py:func:`make_array`. + """ + return make_array(*args) + + def array(*args: Expr) -> Expr: """Returns an array using the specified input expressions. diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index c14cfc2dc..0d40032bb 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -576,6 +576,37 @@ def test_array_function_cardinality(): ) +@pytest.mark.parametrize("make_func", [f.make_array, f.make_list]) +def test_make_array_functions(make_func): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays( + [ + pa.array(["Hello", "World", "!"], type=pa.string()), + pa.array([4, 5, 6]), + pa.array(["hello ", " world ", " !"], type=pa.string()), + ], + names=["a", "b", "c"], + ) + df = ctx.create_dataframe([[batch]]) + + stmt = make_func( + column("a").cast(pa.string()), + column("b").cast(pa.string()), + column("c").cast(pa.string()), + ) + py_expr = [ + ["Hello", "4", "hello "], + ["World", "5", " world "], + ["!", "6", " !"], + ] + + query_result = df.select(stmt).collect()[0].column(0) + for a, b in zip(query_result, py_expr): + np.testing.assert_array_equal( + np.array(a.as_py(), dtype=str), np.array(b, dtype=str) + ) + + @pytest.mark.parametrize( ("stmt", "py_expr"), [ From 54e5e0d9cc876ca31eea6f79a623e5163eae75f9 Mon Sep 17 00:00:00 2001 From: Daniel Mesejo Date: Sat, 23 Nov 2024 16:22:03 +0100 Subject: [PATCH 077/248] fix: udwf example (#948) --- examples/python-udwf.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/python-udwf.py b/examples/python-udwf.py index 05b3021d8..55de2bdc7 100644 --- a/examples/python-udwf.py +++ b/examples/python-udwf.py @@ -185,35 +185,36 @@ def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: df = ctx.create_dataframe([[batch]]) exp_smooth = udwf( - ExponentialSmoothDefault(0.9), + lambda: 
ExponentialSmoothDefault(0.9), pa.float64(), pa.float64(), volatility="immutable", ) smooth_two_row = udwf( - SmoothBoundedFromPreviousRow(0.9), + lambda: SmoothBoundedFromPreviousRow(0.9), pa.float64(), pa.float64(), volatility="immutable", ) smooth_rank = udwf( - SmoothAcrossRank(0.9), + lambda: SmoothAcrossRank(0.9), pa.float64(), pa.float64(), volatility="immutable", ) smooth_frame = udwf( - ExponentialSmoothFrame(0.9), + lambda: ExponentialSmoothFrame(0.9), pa.float64(), pa.float64(), volatility="immutable", + name="smooth_frame", ) smooth_two_col = udwf( - SmoothTwoColumn(0.9), + lambda: SmoothTwoColumn(0.9), [pa.float64(), pa.int64()], pa.float64(), volatility="immutable", From deb1f255ac314d569ee9299dfd421028763bbbb9 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 29 Nov 2024 11:57:42 -0500 Subject: [PATCH 078/248] Documentation updates: simplify examples and add section on data sources (#955) * Add a simple example to the introduction page to demonstrate loading a dataframe from a csv file and displaying the contents * Update basics doc to be a little more straightforward * Move downloading of data files for examples into the build scripts and just point the users to where these files are located instead of adding url lib requests to the python examples so we can focus on what is most important to the user * Handle a few errors generated by doc site builder * Switch example so that there is no confusion about the single and double quotes due to capitalization * Add section on data sources * Build pipeline doesn't have polars and it isn't really necessary for the example, so switch to a code block instead of ipython directive --- .github/workflows/docs.yaml | 2 + docs/.gitignore | 2 + docs/build.sh | 11 +- docs/source/images/jupyter_lab_df_view.png | Bin 0 -> 150303 bytes docs/source/index.rst | 25 +-- docs/source/user-guide/basics.rst | 74 +++---- .../common-operations/aggregations.rst | 10 +- .../common-operations/functions.rst | 6 -
.../user-guide/common-operations/index.rst | 2 + .../common-operations/select-and-filter.rst | 11 +- .../user-guide/common-operations/windows.rst | 6 - docs/source/user-guide/data-sources.rst | 187 ++++++++++++++++++ docs/source/user-guide/introduction.rst | 34 ++++ docs/source/user-guide/io/avro.rst | 2 + docs/source/user-guide/io/csv.rst | 2 + docs/source/user-guide/io/json.rst | 2 + docs/source/user-guide/io/parquet.rst | 3 +- docs/source/user-guide/io/table_provider.rst | 2 + python/datafusion/dataframe.py | 2 +- python/datafusion/plan.py | 4 +- 20 files changed, 300 insertions(+), 87 deletions(-) create mode 100644 docs/source/images/jupyter_lab_df_view.png create mode 100644 docs/source/user-guide/data-sources.rst diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index e47497b2a..86288e2d8 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -75,6 +75,8 @@ jobs: set -x source venv/bin/activate cd docs + curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv + curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet make html - name: Copy & push the generated HTML diff --git a/docs/.gitignore b/docs/.gitignore index 41e135341..6e8a53b6f 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,2 +1,4 @@ pokemon.csv yellow_trip_data.parquet +yellow_tripdata_2021-01.parquet + diff --git a/docs/build.sh b/docs/build.sh index 5afe85812..31398d195 100755 --- a/docs/build.sh +++ b/docs/build.sh @@ -19,8 +19,17 @@ # set -e + +if [ ! -f pokemon.csv ]; then + curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv +fi + +if [ ! 
-f yellow_tripdata_2021-01.parquet ]; then + curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet +fi + rm -rf build 2> /dev/null rm -rf temp 2> /dev/null mkdir temp cp -rf source/* temp/ -make SOURCEDIR=`pwd`/temp html \ No newline at end of file +make SOURCEDIR=`pwd`/temp html diff --git a/docs/source/images/jupyter_lab_df_view.png b/docs/source/images/jupyter_lab_df_view.png new file mode 100644 index 0000000000000000000000000000000000000000..9dafb4f61d3a82b6f51e03fa43f3c97a4f8bde0f GIT binary patch literal 150303 zcma&M1yozj)&NQkDGtTGP~6=q1b5fs1P{SoT8b5?c(I}_UR;W_xCgfYDeeTfATRyy z{bjwk*1P{%=j6kBs( z6cj8e2LM1*2>_tgbO+lyI0I2o6hEaJpc(2666cy|)77Iq)s_K=zE45Xma%_QMDkl+ z@o5-=HEq=VgxrfcPU0W=<*%cu@Wia_@DGBtWTlNrFuga`UhcG32ZB4mM|W3we2*d? zMi+BE^z3nFk zo>9Q4bjbze;nA4?CRCgnMUlV9;ZEF)vnLYe7n`4zk|c^O8CX)loW1+UQxFCg-dZ?s zcK)wa-t3ZJhz-G$8t162C$#16EuXParWa`;%zuDyu^S|#a%2PZ(NVlmdpcTLl5g<7 z$v8g)_23PkXF5pj*xae{ffSR6M@hS)TLv&iG&7jSR@D(wQzxA&F7ck@kwg_`nV-iE z7qTr^T>HQHR&sy=sPFg;!;!(&@ zvJZWV)jW0B?Xtk>aSi_(-W8}W-s6wQ^=jiqFAGReg|VMCDBo+BKu~4S{EzZRFtCGK zxsK3K`F<|*X}mU#4%<+yMrg*%8o3YLJOow(QB-HO>s_gWg)zJfNgo9X94gp@F{7+s z6eOaAOX??bV#Mn$rf+RwkC8yLSvF)@Pd?m@Tx-e&|FEStpq9$UoMptTAwczd?q^Mp zy4^+c9F=7&-V0L}!%aB{ckS-IVXt8}>)0p##dX`zY&wC^#H_UR94S_>4G3I_|`fu~)wE4OC8-*zxi-_brHdy7JuGV&*i z!7l;bZ2c|%HR0{EV^MFEG)d&Js`@AJ0X z?!8bxH^^_lz$PZ~H}!j>gL{L57-dg}UCxb^`xV#WpK1NwWD+2)E zkqDvSG_##|cin%r6B?*ZbsMhjx>iDf`L0(Tm=BrNU#0NIu4&ZV_!WK0`2tOhW}lPV zQowgw)a+OQNJt^<9utEkox4LCdx)*jYR={z4qkg>m7`%DFwDMhu;>T++IvYPRaZ%a zYfgP@^W%?c9#Hct*pvO#1=Jdfg7U}1-NNF}jt}Pn0}4Xo&!yC9PMu|!M&wHrlKA7I z&re@O4s$-gPScDOQ==h$=3(({d;RIs2ccs$X<97V559?5j9oi?tD@=eOD#|{FqPku z8KUZcys#jgf9V}6@dxEQTK_KxSDbgKpg*Y1Xa*W2zE5s{i5-(k;}OUJcA_M$LJePE zMqzfz1}4!mJpCm1IZ2n{89tr14Jt1#3qYTCKZ!$8pq54?G+UnY_2B^32L9y7HhIZ> z(z0mBLiz7Rq6EMKc7Dn@1*WfLJFn`|a}qeE=L+<8T_g1U#WI5>uH{4yex^y zw89}Jw6@kVe3=%n^ux%IML%B8vcZr(aBS`eP6Y`fiqK7T>=)IampHwIuGEq*^1`|L z=GMhdMBB(ZBc!?z$KdDWzY{Gqj$gZ?TVs%gm4*?>h**lPa<8&4shEcO$iCJ}w^uuV 
zAHb8DA(yTGPIc++l8U&TGm|#~LW$wCWr-jP`=|D2RxN_=q%8Ns01@C{r6)5xSh;#kFs~mlZpH8`7t$X4Nw-Xa!uP z>`~1K@sp26Z)vFHZ%(C+mpRzk<2ukx?>NRgQVMh|Iyby*5N#-TK#o8mzlQ*c)6rzx_k~Ki$=b=6lWR_+ zbBkqO^5x0#6QjG%q6v}-9dW^FE5&Nljk8J{vEja#gH+I#2w z&D)DxTrAgTVjoFAmZDN%kCC6_%uydnWDB)V+H8#e!##J@q;G319Mb<&b z4|?r-O%1sM{jCG7eYYLhqkRzDXtmT)FOvd}DM6_BM^F}(Y`8`6b`rV=Fiz*6J@Xv|mzGe%S;J4LATFL9Sp&kFmhvA*(~%b}ZMIiO(r8oRi2rAtPdp!okf$t62u%sufjOObi>S~c4tk&Bd2Z^KPT>w>R#9Nxa|_} zvW*akERX09uf`3bv?QVXWKa1q9)qIJedJzoSW(L+=vCC3Y-OxJ9*9cf&8n(>0kAv9xGq>|*CAXhzE4^qvTydT07Rj_~_R6{%lzE0Z*ODyU4szQ4^GVir@$K2p ziR`K_igd*cCC(bVG$i~k7tK{CF6%0Nuxty7eAlktPpE_*jMRBWlApwOEm-LUSiB7O zNL*euMMLxYl*(22*K7Qoe7UY4?)2`cKO|U$|1oOSxQ#wP!pe(HC6mYu&fLswmuYlu zKi1Q?pDn4lTF_b$t3`AF?0&iR*+a)8A*x<>pE&00K*nU223ZgQAP z<2Ici;VJoP!)iic%(VP%%(>PFu(aIo$kIcCB)Iq?~>`R&ZjTtg=gyOQ~2 z>vYg=AFu~l02E3`ASH;5hulQ;SgN)DY^7BNtLcUJMbQk&u<5W+Wqwss)vh5$O`~N_ zZ!2)WqXP0Sr114uD{!&pX2H-E#Wh8h%Q?*O=b*jLLWxFJcorCpv7l`L)YI!oZEmpG zp8|b?jy?9>Rr>$@eOY^G?i1zhc@Vq2@a|B{w0+ohZj!uS-gv4m;O?kV;zIzrBb8&* z!OXHrBkn=+Q9y^3*W>83RkSZSytuQ}wc@&d8NtooBa^8*Ctd=H%&j`8%us(O@EHI13ZohkJ2T_eKjA(k1c0t z8bb~aUE#tH%*R^8>&^XFqJBACxf>EUlLC({hYmM<$#yf#oXh?Gp#h7z2>~k?GIwhp z;K95vA^IKlettV#kDbQ1eV1PCpWDcN+ppsuvZ%nOFu@CjE2+SA#L*dI68DzIJJ=>z z{E=mMst%&lnb=Wr?}%tPF}ZF+JZpLO%&IDk?UCr0h1}HC6W{h{D8{@fp1zWW@jBU$ z8!xo7jW*Hmm@0(nhhoG~8jc?&N=P1-m!4X{1by#9Z}--pM3p?jgJX} zei0xa->*;q3mUcd>$CqVKeb1;p-Afhl$4Nf9cyBFlfPxn9%$1LEl<@!C*LlNJDW2hs|1a&vOOmV8M|ODpbfV=JmHEC28A$S;Z4 z_MV=uqFh`)K0cg2e4Jo+J1!m(5fLtKUM^l<4kQGJho6h5r7wqz2mQYW`LA(gfgaZG z4z8XKU>DlI<62sQy*wpezy3SXe_#LlpFm%S|C!0f$vS|LG(*lIDaa!z{tx{Br{#ZU{9io{|EDL95D)+V?)tx){@-2oJb>;1 zFbFxPr{w>v*S|ae@5X<36zBSj`u`%uzv%o=Et1fZFU7h3d(k9cPI6u|A=i=8K~_T- z`9|)tztfT|r4ZxbN56!z!=s5(Ab0597X#U~hl+n)c~`yseeqMpV=UvJa)8N4@`#Hj;!Re2j`M$*96k08614BMhFRx7qJ;$Dwb`Zm&t!dqPpX0h zjoY-=D*6A%tnbq=sfUwt6>ZBo2*14S6>-+Cv z`r>|dKHkF*GVOA;?Chd&x6({p202@I3TBDs%tP)EQmX@j`%U_?7P`+n9xkfwq#kb| zCh0Ix({e45!MmCZ6SmFg!58J`K%r$%oxW3_PK2r8enp*ab;$jG9Y@nr?H$7*8S|69 
zrf+${q5(%9d1B-Hs0c@VLkkGx3U~?azzKJEvlyr|V|66M{LC zLH>e>gSsnkYn}6*7_JUs#I8ojV(U*+?}axez*Ifs&VI|y#|QXhv>oHKA^Y4uZzw2z zIT&%++iQM1nQlvB=DTXoTTrfU^fWo2lKjP8AGQ&4UPJFmB_^UPnK!;W`H{r?AveS3 zwd)B|9fti@_p7)e!U0HL;)@QL6_9)T+}8DJg4BcWC350cIm^O0!689d;Mpv^`E=j> zQ512S-nq1RSDW6>9x&=;Hg#K>>i@jsG_~5s2NvXG{;BQAt9n@9Jm{ihEXSECpH}p0 z$%Ai2A-R{>gdGa*yg%3;PnWu#NT?m=>TIjtA5(-)#0jlrn1telZS~k;$ zcg`z*9jhv-$cx97jqygH}AJ)c77$xyL7d|5m)Z>U?aUZ$ljYgdRD^i);<( zn}8ae7wa7sV$9A9W-r~0{Qi7EtK}OLE*uyW+0IVpsOcdfi^vYTTK0KYD|{T-cGV6C z@zv%A-|kXD>7PL~2VZFZlDgSQ9&G1>h=O+p_X1 z)>qy2YUU@=1~y{wHVI52(w2~x5{A?alV=^iDIOfxljKmux2Mzb%@`D7J z^#vU}3qhUh3WCXg#%_~oXU^Q0j#q!Z|J5{{$Fkv8@|ArX+)w4hzDTgFljW}v_~#kT zfPGVDw*Ou!y-%psb37Qsp)ppWey(9&s345k#5sxM$FHYpAPzLy_opX%)&6=WdvotR z)EcL?RT@B9dr}2vaUBX}L4q?PSbGnBHI(Gj?U$cj)*~v)t2PA{i zX8TpX+rHRVulzkB@_hL~hkrc^4%Xp6z1!?SEW>Sw+3R9q)5e{bi%xv6@)&*Zg4YQ1 ze&iX?GN71{UePn<`mSMF>fYVML-O*E`QIefoL*FbD7H>9&97y=r?nI=yasc;7g@x% zGOTY!eRy-itXiQ#9k(;pobhDg3Y*{aR*~EXjTO&myYA&0yPUViMJx=szr!+OUZ^x3 ziKBAs!C#<&UH44}+|Bn(rUjkEm^&YaY>UA%PwdFk4+9=dmgj)Hp#|bs&(VMFi1BhR$@uhixj378)h6i7TdFwZw zl^xmXDrP>5!7|cgnU%tMb3LIzv#c*)tlfJoz3_3WuNkxTr6;jjS$$nu8ej5YVt8^- zz{(kQ!t0iJ5BDiA?^78G7v0PmvUx%c@gyqWKOYEJu6&8%k9Jas1+4xSGPORn53zhp zCM&0DX(wd|et!&Q5M{}Qx(h9f59}XUfue;8$mJW=hoT?tG?P(D(km%fd&AqW#uU*} z{j1mYaj?&A0~Fib0K|ngk{6{KL`GkEt|tkS|8wx#ld?$&9zW}6mlvko?rHIC@A zM0Nkly)X2MWv&k2*S!RgTTiVy5zB}$k-t?sqGC+A^CS*z0}p^9KBMVr{b0~=wqO4q zM~l7O_v`b`fP&=bpj{R7vJ^?Pl&~VwOZy8gscxFb>;8qRpGS*UVYSIgk!sOcMvzI_ z{1N77D-CXVvC2qxgoRc7VGiw9%sf9G8;4zQFi9;azwAADf<`%hE3ubaYCphR!n{O^ zp~Mi+7_LhDqHMxxJBVHz{?M;~3fan3BjOD_~bne9HrvM_4`J`{?Z zxh+hP{L+56$)&MXnp{W0q|0H)7(<82#FJ~Fx8MPhsO!d+YTH`{h*XgP(t!5zEOqF; z${B46)P6lK?dzX;FSB5Gwx!*6{mD!xV{Y2CeA@MM=#;V13wC)yj3w7>W&&>B?K0EV764w~oq!mE$OC+hhp1I3m{ReG`NCO<=6EQs?=&L@UpG zasSObgo=92HXm@XHg#T+)xJ)$4dG*@i=XUW7xn&uo`n~_y$JyI-Y4G(t&B0h!=Te* zEEKd^t%><&`-{FS>v1;2c`=DqDPcsY2s*bhPxSII^Q)l3;D~n8 zH{#`Hvvv#sVmcsR*RPu&e7Pj_IK;;Gbe#@6`<+Pg@X5a}c@E@7ZQ|YC%qcC4W&Sz) 
zwMj!CKumv^!M1Jwcxy08>aY|qemXD``Lq7eD=p05_Epqi*2;Ma*0(2v$LN8FPUczT zW@grL0@1gJ;aq{1-VlkaS9C4+%pF=y2CWmncZ&`r*2IDkP?*^l*%^0RUea3aAchW~ z*#5BZK}ca1eW~Ipvm^Mq@zu(EZ&^UvP{reizWk4kc_cDhg<(v(?1u)6YGy09vrV?l zx^@g?Brka7y-VI6Zv%Cv#KPyzoF zK1y#c?*7|CVlLV$x$RiqG$$|e(GQl-KjQl#x@jIX`2$JJ15aW_y81v?E>;cjAKf?+ z#Tg;=5i@l;O)LIReKt*uDx57gXsuScn+UZ+0f;)J&VVn)!y{YFth7qlO?dP`eln*5BbJN4><<1GS|05A^ z1up=7Q+sPz#_XF=AkMSe^1FW5Zf-+JWt67X z+fey*6x=1E;#;nN8OTiFv|C69Jks9S87z+iC~)rSy?I{{cY@A)ReMy%=JvW^YcKsJ-NZWOy&%U%|{{f2>0-B?qN3qXVTP z^E;_Q9wD8L)pPD8S#+4TOHCwLVatm-57({xVRgloRx(g*I~%My#(E7hs;(wb*LOwv z8(JU?55dfKO>EyeQB8NZK)Z#JH3B5gMtxI1%KWnpPkOKfrRH#iWE&u_y{&nCH0E=pN$HU0BLSE?hD%M#zgmRcC56uFXMiH04E{VpliYF zLOIG{O{j&fx~Dhx;&{7qHM~$G&nMXj#q^0N$4bBu!=fSS*2FIDUCFZTW`;)L+`iC_ z1iu!}1o1c6(}2Ys6Irv(yRd{ONa6#-<(-?@D60Tcl5TJY647@}SFr)pG(mXH|<=3fnbs5IDfZL?JfG z(Kki#=HSbNd_h41zTwG&LOA}=TTF$g#%sVJN)+U%B$Km zvNq1=O$SBW&#*3jz!xeA!LwKRN^2tCSRlbgud^|+YLFv`n|_z3rmbOqd!Kz%3U5j3 z0E&T?;rR5=(M^oFgYEC#JN6#H;>Awpr#gow9TI!PugV!+2seMyG`tnV>&B+*SL5E2 zedbiXsmcT(Mr}iyhmCri?xv@*na|QKG2$8O6V$lX7tZk-YpW24X%q~urf}@41kBW+h>hk(dm!M9lL zDZBL(J1}1*GS_Q$(g_L}pIE|eM29Q?T`xFR#GA##ZR?YaRA#W-9z{&Q;JsL}fr{IX z=W_6hf(qB=iL;pcAK`h~Nb+1Xd1|ivcb%p1d>4^0>1h1Mu&axBz{3OcM+ZLtZlAJT z`E4b!o^KzYq;)7#nPx#$boLvtjeLy)ijBF*_KPwO-cV7o{q>Tx<018&hrkrfk$pSt z9+wFASSqZ;tMCpDNc9xTi5XjsR^JY@{8&r0;pD>1)$a$I3tN>y1EYuP}Nb)&X`}rht6X|}Z`kBko!g-?>Ct7Rm z?0}6Wiosvu5ZEK?Q%g~6zw>2s_DF0@?gS~q>W-Of@7dMzr>E;EdW2-Q^X#I=2YTpP zy}6MVWk%O89m#ZtVpMRW0?_m#)HY!&nFFF;6+-^++oe_Vdm)gbby{0&2s9`w}_=teP1hvPGCb&d!i@(P`-osR;FhU1J45yV|J_oWYN0Jr@m3;W{Wb+4jzsDN&LQE1fRws-sekfmk zF5%ETRvd_1EO zz?1iw$0uBC%WNwY9+wNR+{T;Jf#LNzgoK<>9(mng%q3(Y9wf+yD>}f9k%T%!J{@+H zIQ}<{1)tlYR}}k_wM$@AizXYX=t4@EdKm@enjDNMuYtUv!72n|@u2fIrPF8`Z)f(4((9FE(3Qg^0bD@;J(FnE zY^3d7_Z>3q`SMbATtW-i4*?6Thhy zG<)!e$Q&k>8@!U30_e@|(8D`(x$s$;VaB&dWpc=US}oY z;R}DF14`itU#4RO`%#RRqHk5!IPIB7+4y^zQahP`bR0!Oh>I>v@f>A4Sd%!jNwiae zMGwn}Kh#gZCiyKHfNUd=?W(J;8$eU-|Ekm1i6B=rKCp-`?#~58QM*W8M63$Q+S4n) 
z%$B|*y<_6sY6Y*E$-Vz>lR1-S`N;z3(82Ev-mRMN*OQb7q+oVmAI?z=%~r^|GCU+N z+N)LkxG(}f{+g1vi1%WjyI7q;jyQ2W;5SWi z2|5nQ(AHy7&9%qv#itZU<7@5#XL`6EOYB7IU)`rxp zx8&|EB`btE-CXT1R2pe17=>hjgc&R-1@hn7sSC^NojJGLHPY1dHIRMU>3mi-*fiQ< z|6ZNU;NZ*0ck%KDnR0Z0b6GEKZLl@eQDq%oS!Zbf1$Yq-{0Pn}V+P z?yBz|PZ^TE)NG5CE<=fOiom161oa}EO$inq{ke9S`nH0?GqE(@&!S|EDjW^E3WW;a zzyp&pDl0THUo-1emSFqvL4Df`{}8Wu7*eTW5>`7M(~{TH=ZP-K(h_A|DBoZjo0|rk z{%Y7uIXSOpw$r)r_=X@2r(;V?7XPN3F{OTUiP$?g5W8p@+Uwh4Fxk#jAb#j@MsB?m z(T(b8my&eecBGmqfmwKLo%KOb_>dHaL;E;=R*$HRXF>E08R{q$7knYjlIGCyZzw~a zb#Qu|!fBNxa1-Jj`88LK_^WsE5PkA_y0KDP=q8@uPKlT!Jw&gJ(yjb9(LEl#-F`6{ zQ#G15H}CJ#09HK>we~k_CJyoI9ak3O!ZU`RC6ZMh8sqH86)Id zJ|}#293}}A8~p2RX6TyGi*-F4&`vAtB9_tQJ ziIaH$LBL_d;Wsr)?W={PG3Jh>C11WAyKRvZoP^}-wp593@f204ge&D{v64$-?hh%0 z=krHaWWzrKJq@722(53~qZCIVq`yVfs1{&aGL$sXG6n8 zZZ_lnh;&r_WTF(Rz@C;$seX_=8yD%_+@-%hZ27>JSy=|hV%lDPSezW8+F>kGk^p%L zs|?ZZnI~UAY(#h=ZLg*!_u)pTV~%dBL9ys553at(`wpxaFThGE=N=Q}!$n|`chbuD zhhDohF{2dQ@K(MpDIZ~$cnA?L;pJw|p}z5rebH{<{zIU7i7V~JPZ-?KSjKC}YtddZ zXx3wW#Pc*>^QvDkygFw6=EvD#mG~>|6SdQoRZ zI2^`~Z7c^F6=$zL&$xCvIXF$60zrlQuILvz`NL>Zf?7@+I5Z?76=w%$6An`z-wFzH zDC>wipNQp&#+>G!zW1&LHN83fUBb|jhji-;c)|202jdP1=b?N*-T;k{Qn)4Z5|}b= zj>7op;>6GYoG%$lN#v;%+hZCVH=Z9ZwR%bC@Jh}IeOs{;0}6O)-|4qRBHiWUgI0+Y z)(AS>6MtXAS^t zlL9krV!fgx1nC9ZyL8bAZy^Zdp6UVFNiMBdwfR+meF5cA4gh+~RU<)(APo*Nl2FTH z$ir>?qj&=bRUxeY!{|Cm(-7+r z`zw-`^!~6uUw#{}pXu{>Cpo3_%rVi5qNn=7i@e^hXM?`b_(6%o7BHK86`Q%PbOB#COg7llMJ#J#Jd3 z7_!jQ_wHphpW=9{#rK)orE}K(>Q)95<-Q3~YcS0-kO$ho%bX7KF1(iCIqn6=!(ZX7 zGn`}=<>rH7Zh?}Ji7=C2q^nZ}Q@h0^g<(=$RSG2?*>Mq3rH%t#zhs!wq*j;~mq<}1 zLT0tB;*{5fVhZNNz|d_0y)<`cF1RWYFwC^mnM^Dou$r+RKQPxwNG_TxS>wR8o{9PD zrf^K~Q;up!x*^z_o4TP#{07dp#i3VXef_pzjgE$lN`oubJD2693K#MS_8PvpZW~mQ z{L~ETI=6Q{g+Et*l@~JWkY3ZCB`YT_DFf_cxyi-_?Ix#dp~K$>a;*hH!9)3hR_Be* zeV1Ggo78)t{-ReBrLEr&i>O03%pO4E?NcC0N%F?x zk@~w{K@wOq+{Epjr}vsEPpJ;cG{<@&U4p+zn#RpQPa~L+vyj`S*W26EK9p%_ciN*4 zvB)-vWB1-jzs#f;rTp`#mi2PG_5bW zNVN-a`mZ#ss0-F?bbFJBYb%B(7OfjX)YsS!S+J=1oAwpxq*s*Lx(6& 
z{ra0hxvZFVDMAso?*6dy@p|vt?Th(yj{ziFZb6HEZfZP zG?#=`CE?RVgn`f#KcGpNP%R9L3BqQEFk)Zd*b{`KC5LavwKZCE#D0clP$=9%V(+$c z;u=BS4-+c((x+(#eIUB)d3i&=En?bO!c}`!_JjBL@qlifSP?<+w5=anm-_JOm|R)_ zfG9sWG%)cEkA;(ukNu z>Xy)PnngE>2;x{YoubX)F zP#zN5Yn(@of=dNCVum}+#e*72_a@^h`gnyVi3uS(eYfM;rA||3medTGM#Rw0mMLZY zGOk<-qg=N>GP$UqzVDMcgjgzGyw zGy>7WJcd2ZF+GsI0u1DBEX7Y=KtlO~G9Xv#~k6hg2aKBar+1)%aMh5H z^DUvw@(MEQTe;z2!s``5vgK-Gm*aI;GD~UnkAR4Rf%c$`>nCLCserKZqQG8Gz~dXC zF_McW-1Z&7__sX=8J)E7@Tx*5rvMb}`|rSkA&H3Sr=Qx!rBAG)1tk$EZ0@oF4OF=ONe6hPAAC@3iOXrQ^Ws1T8w>txP*&4}tObSIXS4I3>rx&QY?( znad&0$6phz*4!R^UD~5rF8xKoW#qMMf!u=dex&n$vE-{wf!F0Aw2?+ zf926>_@(<@!E^!5A2B`8an0E*&0K%W_j!poz>20d4AKuCTpP`PJDmYqFZ)lkN4~6mIE9MQ>$W8A{2Cuk)N6XD z;dONAXA*p-slM=%t-?YordSH~1rIzlWDbfm7Gg2~H!A<~8!41zSQonDU{rFq<2goq z>GoK^d|g<%mP2{q-!{B8j|c4x#B6E$LA7WszsYo_Hm9%zM49x0}p`b9wx0WP6?%HFpxh-X?N39!ALcvB|@}xo7Ox4=f{3^3oq|MQ!XOgfo{D z7GvG4w^I=vD+zRL=oI~}Gjar)9%hAbvqpE^jDm>^kAqfpqFJR{^P{y2T>dfPbvS*^f+a** z&YcDp?F-|g@oAac?6N?DA)oTA)|c}l)%SP5q!b5k3x*~nm$&hGC$SEp4{dQY&RfZt zf*gwW-1ZN8Phma;hj;43^DL@VS&`U7q?NqNpJ1)v{+_^Itu=8#)_cls2CuyI+S|#- z!iwX*Fk}>`b<fg?swg2uLE7cxz zbru2;_iL*6-KHO|Gx+)T%M8L5R&S!jkq+I71*BK-4C?aHvjDAwre@jOH9=~viGK{J zpU@&WjU+~>VDBKo`ag%U8mc`qlZmQW7n*dbKlnz6GNeWk$w;bsD z9@YI3<1i`9rIjTM>3;y}^qmcLPd5Y?(q5@+UOHO^gcDX!JLF_nr(76V=MCTfepoU* z3IOQm*o_HTQYu6||Kn$G5$#wJ}4TzVi&X z9c(G15YxY~U(4R-3Co=L^f%#4GwE&Yrjv)w!fua%t%*@fnt`~tJr7HR6sHjTD-SyE z1fpG9w|D}YPT{@)$-kjdhdLQbSVl1P$qla6Q-rm9F~|-5%=7wi15~S6?P23)-g2b} z4a8PG|KbycRO?r&2=RKZe{Y6K8Jny z;LehZg(4`pylOjVtg%Ah&+Z^>iKqDRON)D#cSD+6E_uhCIK$z&xU4G=89!lZf7{|9 zPoKnvy7KA7Os~y`s$wpMfN^AxNNqC?gvX3aD7t=(tXFv?8Qgz;w?T!#@Pp)7Bintx zh(!6PE*+11V@vr7ILvU&>71eVh)=FWL71tnJN-i36Bc%toFqTnclrHj5L95-xryjk z`#$9!ZoHr1->m9;nVSp-_g_}$aY4TIw?#|FEiv!;T(#qP!1|8ocIwB@hKSyR<|>if zTPAgXVs`bOJXN#gX4?sFJEC!9IDPrf!uBkyq$A%jKH03w8LZOXNps@g{*a42v8%82 zw|#Hr9&XZ7H{X3k2F)-#unD%|CmzE_58vT;Hx*|tv|f9mt-!zvKt?&on-ePXUP`0s zCrOsmMAvQG`R9z3U=Q=T!@o!Kg;9$5JA3Gb2U^)IomZ~NrykM5SirL6v$-u1PU`JdX zo06wPvmZ-)8kKU>{}K!qxR 
zQuMV0Cg;!Xo`>T8RIGTMF)(jG6f97{7F*9Ioc8&fK?6m+jpE$WwWi7EQ$7CLan!mW zebeGhp(1YJM}sH6%id;FlfTxvgOTi0Thc>mv=n@<90Q5}VLqa=ReV%OaW3>(C@n@a zKlUx}>b(UHbJoZ1#sny@f7sbkZz6VzSFAVhZ^HN=D#1P0lx_q7pS}r!H4*HvDe#14DWn=U>7pGG40L9pT)I819sxl?RI{+4*FB0YM47J1Z%78p!@v7 zV?4IZrHZ%pgMXH5X3cUBC3)XR(To5=B|p#g&5<4KJd`datRwRd^bFQ#ebN9hFF_(p0Iy&?(z8l=N;ib$7!c#OnH1|vI z(nrhRtG~y0V1$C2Or5GG<{vgmS56w{%%C`2U7Ebw!L%qO?}qVA(na7vQZGS}AIc(tTQ|>_g(GCCJyFQ?@4~4&o{z=}e(V(C zG9gcw<3Hi%q~1o5lBq5@0!cmQG^=jGIX~j-*W2+PlI-4+twCQrWK7>8RekLnn(4-Q z>H>~06!wdYpm!8o)1Y%V)u?$(FH1@e8EBm6-&_WxM9XFJ(=Ud1xF)sav=JQ!OwBe%b@0QOWGGgn;u5?zeOS0 zh$t^qpd8kVVYiyN@p6|4{<4(a9C@4K6|F9tsXo5^%0!A7=$=P)NEFOaIg&3^!Dc3i z1~SrJeby0RIn8_sy3JQ5vD!?P>XWyb`O8prxf;{~ERovqyydKAuGTU_b>ZK^#I0)k z*2R0f^Ru9w9sS#hhL$M`lS<$~?YpnX0?&w`NX<3=^+WK{;JhHNl4Gct3P9zb#2Mlb ztP$*|1VsC3Qs4n)FLX`qsjuI#ilo=JG3q@>qb4e~^!pQg8!{69r0A&ERFL324~zNH z+~v8k+Z@nKNc4#G%0fl$Ua{Si*J^uY8$Dp;y)C5k4%5NL7_-DdWki3|^HdqLTG&d; zlERtba(-SGV@RCEwJ2XYa=>Q{A@EiW2q4DxZ%)DJ``t@o-UjkPMl&>F=rE}qjg)ya z&AYFzDj1e9?)dnRx;!?2lT``VrgU=g>T(0}17qbsE?ILHVhPYS`ba+Q3no_S;;=}e zLlu3w*t+Q~E=4ZC$SL`YhI#2s%c+u$pUT)Z8{0^0Hd{L}tPZ8+2-C~ESenp330iIv(ed2$opXNDjtQ_^qt^~WZ zC4}6qldb+q^abf8DxgwkeJT8K*q!+ORbT1*7H#*0jWz1hEO*=u45I~PM#7@#M2%WE zMj#0(;p*tSP-ezJW|2Lsq36lhs)ckh>w78tdZJf3SVr|X%+{**0mlz#9dta|MnVmnr7?RwgSr=2lSg-C5Zp`9tq8joRW9(q_e&Kb9_NG^4cur5p*@TBJniZAB+P!lr$`Q@Y6-}L2G12@?S5xB6tGFQsSS(suF zfO43J*~Nk6EKbM8_auU#u@LXufdaF}-^4>z?&eowgpZ6SnW@d;ADpeIiLmg@d?7p4 zR?}UHDpHw^CX#EvX!OrxDj&%a5|oM*xvqir%jkoVHN7!HHglDj$oNRebvF%Tpvg3J z8o-{(G!g>y#x3NvSw?0B4|Y7c$G9z>=BSs>pkdjgy!Lj(;{ODxOn-1Z@VIasC_{`` z=SX*o0tcc&!7KY_o_s%$BJCR%MhD3E?xf$&mY@57A7J!T?Gj9>}URzBLBd*5*xJ1Crna4EL3oe2tXfXKTQ*1!XU*U#~ve&%o zr-5CTghgQpjrHln37Kjss^GKf@z$CZ=gG z9bu#T%Yr+RZ}kAg4*eqj^IOR6N%uQAj2E)SA_<-vqOKi|1{OfsjN765Eo$uE} z35WdN4(IX4w*of#7WrB<@E8OgszCHF<{CPJt8c*wKk9jiSnu<#nhDuRX4bbNxr1)j z3ZD;`!_+(BiPxe{(3b8c^Ir7NL%3(wf7)qYXe>=%GUhIS?Z6lPT^jG&)^z<-nVvV7 zFAAoYOBZMV%2mrqD~*PH+*jm_bm)49)Ze7-K^@f^!=~)G9AmQD&&_D9Jn{ZA3trc& 
zkxbQ}cP8|%WRjMK97Q9W~-qFa~!kt3)1JZFa& zK)hYFNE7U-qw15`S*YDS)NtIPa6+konJR+e7U)U3D#|R&^CeUEh0ai--nVzvlPz$Y z0hyL?E8z|?=Wat1qByr#%0u5jSVdg%G%{(j7A~Cv%iPE}J|{JiR{7(f!WGIDifvNs zzw7XXWms$g+}2+J(phTnygPFv`&qI<_d?K8(=wR4oyXN3bx)>i9q`U$JJg$bv zzX_IlkVLIf_uC>fXh(&$&TN)~znh4*4X1zu=@k#qepIIW$uXH^_TFQwxW4*rR6t@W z`Q>jCyO**qN|U#z;fb<8PB~J1u&lRuQ#00ZGyE{qlLrpG4iT2r135wOE$%k6PA>pC z5030;0RlEj)uIsbSu!BfN(e$dH7Ztx+YS^2=LaWb1lJyFu<}q^H^Q)y%D0n2|ORqZ#l51Fn4({lXVY^ zq8ny&YI9E*!hw*7C8{!WG5j?oxE-JB8vAx<;MJnZ*(uvI_glhN5_?5>+R`*4<9Cob z%QLegR3&sVZGWDmLd2%Fc#h8O_rvbLF6Da78iA$;(rTVzDs+kI;ft?oDatAdpolOE z${zPgYwbzw1R3*3pNjZI<1+;Z=mBk&cekc?>m_HfjhI`kqNsdRy!5~Ldw;x^w~8#E z^0+{>g)V_{kL zF<-6jS9b*QutVtSBb}Gz!=GCnUj3AS@YHn3Y%>NxgoJ91P zxgv5gMe64A;7M{Gb!oOtggtzy+6uSAwiBkJ99mht>77Sy_%bhkDs~V0g?$N^C6S`? zXZNaFvnzMbNXAf2b}3~<{Q#Z42dv@fD=9=Lt|Rff)6c`>a(k`0eZ| z@(_rFMhed(^PTR1(&ZX&mtgh7lNn{O@Vr;eQ3*bhWfsFMAxHPWN>clqe|lrzQ{;HX z+#LLMNmJP6P&lO7awLE*yxL>@%j8jtv5?zqLXanqgtD8*3WQcOXZpHM=uvOUxvNuZI6Sq+pLaW zKXchvaMzPe)9LUW;N5QFU6JS~0YrX;tftuW2Wc9F&skU4G8$L1g-yrUtQSI3Al$Pn zu;`6p8yPIY?F!XT)|!MLJ0jocVw?`Cy9|hMu6bS5>-LZYDi-IwC+mI!iAc$9uB-5KG$P}jm7t(YbJGKXVTmfryFu znsD+sC~nECjUne7-x}$=ii~$@uC`0@`ZbGjgDGE;Qw}a>0xrW3I)?oHp(Uf5vBcW( zl1`~{p@*zGS|8t)74qG`%Ndl$*B-v~a_GsFQRBq$!sO>Z&sOh|<7DsSNjt|1RW>GY zE9kGCIGw4}Z*K1iJ8N^B-E25M2(LEbybN)4C`?jaAU_}1FrYfeSdnO_1v81-X-LKnR$|h(kyIqX zARYy?+&Za{{0!6UUwP;|>ift2w7IzD!4j!+X2wC!aRFjeLt;-}OXwoz%a86wajrO@ z4YZMs0(h;LndQSVDu(7JgXd9vVJ`mIK|cYTHsTX$8LIAgUe-SuyF>i@&iPMu?{lB? zVh7V!-4y--wy;O&3|0uAboWhtg|yHXXqu#d-e{?6nRyRA7gBpB4}l+K4?pJ`C;#R! 
zC1t&XW#_&qF}uv@dIpzv|a|BR+*R9%)1m|taWb53S zm>1t089Yikf3Th)C$hTm%ld^|J@>lsxk+(JrHT=4Tdh%1wew?!mYHH8{g@u}UxDNgmE{rN4oQArUn>jf!H49i;gqjJ~X)&{C|IA2A*EvpyYrvx)Q&+U#GZargRHj;Jg&agGypY=A+2PxH zqnp~jpwmL};dnJCip=gdX`>IR%K9mVH&w~0zo;F{HM052$1vin@yfN$6SIDqR_!lb zbA7r=e+d2j%zp*guY>aB;CB5=diHOBv`}|tn;V2OG8%S}w%ceU*BWmBY4qZoaNp&x zM&W}&Dc{MZps!6vqWok1V~voU9XD641g(=1vaXOmR#`{Rki;dI5@pVT)oL29Fk8S zaLo4ZJ(0PH6AM2nO`$^91 zmUAB+5`CEkNOv=g5O)eC)mz-dSB{Eo!&ij&jDkv-S=sSWNh#rn7-F+U$_P117B_Ma z0xL13ssVM=2d?%sIGqbT!Jpy1wRjq@*svRe=MMkpbw!>o}c*mma(QmUi`h$floGu+1a*t6HMzP3vl= zm~h4>l219d8&|J>4@T1!i!;#B{q;q8meJ=^DI^;LR_LiMlRJVm*_T}bT4kCc{mSiY zo{C$qI`7HRCLED~%4kqYjXPFPw4i(X(=yR`x?+9vxE(j}?PvLQA{EToiOdkxS^L^^ zg*qpp7W^-|u>HB#(xhojjCS-ld;oy*n_>;?AM3!&qN!I;ER}LPoEEgz2jQCvHnQrO z0@BRzdSS6Zq2Vg3t*X&+fN!GSluTNH&K}9xy-Q_~`?2Vz?)(mT2hn(l>4Ohpqp2!? zyVTTw?(N*YnB;rVL83vOvfEC>FP<;Hu{wp}@PyA2&#!2N6h#0NLD+9IFFXdf_R9L@ zj~om&;=W~oeC1FR(NGd+w5`4W z!_BYsBe(x!DO}JsB}^Q@3zm8fTN%0jb~bAH!sm`n$afyX5ZROT{9Y^F8o1)5H45jZ zYCL>>wKr+b=u728cp?*6v19Y1wzq}5%V!h!215z zBB28HzyI?|4UpVZSn>a!;>HU3_7D{;SVby2e}g5@6av`wD+b+v#`-@O`M>$e{Q=Pa zgd4=;fK<(K;ZY458Agn$4l!t+DfAOHIwnD7G>MEE!s zl!^Xtgzb<3i13fhMTh@~pYVVCnJMWW%IPsc;zRzeTLMtp4x5=A#vsY%-&qP?!D=kIOyl^ zf4l+BV0t_2eF6iz9n+fmFb*s6&2s;8RB(RFrIVh9fzn}`z}ZtM=9Bj|xbxF>{!Q81 zJAvQ81v&%XVrZ3|mpJOQ$w9kQ$4mHv&S{HFih&V(zXqH`Hya|f86oj@tg!K>Cq+- z=Y%vmxoI4gG6>(xq(B(CV-*8+vFZ_ zvSt9Ei&2%Mw#O&G;?jJt&jPcfU9A9$$@%^6L=JA<)$w#y#?_{E=aE~to{97L>vlZh ze&DA-9-l5ADLw#NzMCkEj2J=OecJP(V3qhuFI(2>uf69RWf^uglUBd+N3SW-fv3ea z_r+m>)NhXmPdF>&Yq}qJBJ!1fA8pzDieGdIH)}bW-&~)xSl{ES*3^=m&#mfZPC=v> z)5Q6I18GpAH$YkJVJ<+dF_>WLu?GAq+FWnO%X|bcfJ4IQD?pE&uPYdNf*|_7{L%Ka zTbK_)%(vO=cgAO8Gu zzRs{ESo#}svCMnRw(}rQ65Q?5>N53T7is*AoIDbIHM+#dBL!zHU|s`rr`*5t--KU{6vqP!=_UTV8$5hQc+ zkp*#bzNbC?hKUpC_d%{VgijW1m^}5^#weo7#LniW(at$BXhMlQyF^p%+y9}q|Y4XD+8 z6exG-_znF{!&c4fo2!#SV6zaq7@vO9xB>%k?5i5ET+AmL8Iq6X=$sZ~6l;zxLL}F6 zSWtTUx@N>{>nLnf=6e+Z%vZN6ht8kxdKNWLx=$! 
z)Cp!kfN&=Fx@j{?0ru%r^&?pRR{Q;+p2`(DAxU5+VugAhEQ_o8DR0=*w*qKF@iM@v zi-QU8cw|TU@HQL)R0kS!ggP!1Q%O6%Hy3d^)c`>khK-H?&c$Ye)!O!lTzI0M)JLM( z=w9E!rsF8)8ybx(@o%knF-79^fxijG0B`l(jSyTz?~(TKO#ok~3=IKhUn5gY4ZRC4 zty^se-oaApk!bmf)w?iUoRoZda3(6LZ#!jc-@bdcVS4hljYqbQ9Bu?GT3!@Cq70a` zd`Cj9y4}R*3OrU`1tPO=U-vLXkunf$B3YN45A(PMt@`P!A3_PC{>{sRWfgrkqKs&W zE7o&W@Xn-|lajUYzBTbVTdFCB*eO{2?nCV!o4f4<_6MpKzR7eP?3lGb&BM(F?#)fs zJ;<_BQB)|MR2 zZtXYx>`_|!Fj)aK_={L8>_hFDqOCDb&paVFH1G&9G8jym2B_>S!tHyVS=h*6luOAexCv_wYojv@-?s0eN7p_qx#@d;9eahcka3=>o90t6 zmSVK*Sr48X6AP}FcbR%wpl3r)D(S|qsPO1$1e+yQ(@B<2GoLWynG@7gcqaM?>=x#- zO4Z;6VsBqa8aBX6yAFT!)uuZ6$2b+8G?oNh(JQzh^_@x$FgvNu+lY5}lWYMVC~vm9 zSry#w>X7ZJ>DXtAa9+I8Vm9&G>r|i&Xy}Yy>ZJM#xvK@s&6M)}xdE4DNTv7F#`VY0 z*7F5UGijyK;bVOedB%1YY3~4%yDt(QlB7dJWvlz61L@{7{qfoPRDQBgY2D*2m*!Y# zJP3%fc%&KNe#7%l834?Ej~YtyO1j#QfsYa^Uk0CLmVCixWqA+nr{y>4Rn=%vNm;5ya9 zHr?m-Ynt>eOI-FZMa@mXOHGpBS3Y#?HbD}k$tQuR?HOa8PAQQOFDvfwD%)|QHr9)w+Ea2Z*y20V!l89HWdMaO&tlU`8UKs50 zkGt0{c$ffzrw6z^-s2XPU_luoSt^C-K1iBaLUkr16?vD$^9mR2s8q0_cX(aN9Rr>| zCb%m#=xt|f3_sM+(S=_-3$3|@G1iOmLts=3B%}?>17@PWb`w` zdp9x(x%5DN9!e6Z7w=arIf{c!-L%by3|Ie4smtAVik-draBkDHX9*K8xEm-cIi&(< zYph3)vzzHjxefGO?_mV>J*pIw1vo5g+CdYjuRAiym zoHB>Q#5rrM(r+uue%Po#)rv`*I(MYnp3cV$>&f;q1w}+0IdYoY@Eo=m0g0jpb7s82 zbEyEI>Z~TkzPVm7gLqDxZ7>p;PtiXIuQc>krm>thbiq9M8ZY(M&P*>$hOu!l4;^bM zMW~Kdk1Y0eT*hUtNP4{su>PsQcF!%TAasJQsfs95b@|;=u}6v`<3bHaV!#qfuT=IJ~QKxNp;H+KNq&iso@3g+tbE48bNJn?uLt3esGj zO@CsSU8szx$0;6&hpoL5H9)Ig;gv|HJ8PJ|dh_mq#-VY58q{D)`1=m%ZqsrVs0V%t z>eA7h8M3|kbbUX*tsHB}KEL#IS-7SC?eeCJ3EUL*f9`n?0Shog z54WdsdM)RK7+jjxv6$4SRw;HhBaMfZIV!A2$?rOUIuR=vP9T!#!qo-aJ+;K#39!M1 zgTB;B-mDrO=zbf8y$%z)t4IlS%&$i2G(rQhLsXDOK%2=>|Cz0CixM1}M!W?ole;FQ znr>s0@&l>C`(CJIjFi~PQpXQd2*c~1+KX;ND0=5>6~jRD3y^}!Zr`1$q_pO?^Llw{mtQ; z<_oM(@eEPMTVZEdb(n~6Wc1&+Lp4=cz1)OMS@~_o3btoSy}im1Qd&H2=IBeJA9qWM z2DVIgJoJYjR&DuAYM5SnD){iO>(sctkSE6JFACE1-L5OmQRUHa#MMBsu+3d8cq&m; z@;uOiPI?tt9jZ`N+IW>!bWs8?SJ(BjjR;(XJ?w0M1=lmpa}pSq-OHh2aDrs@u%?2( 
z4YY+9C(<>^!;sGl4U&(2FJx%hERx!`0YQ;jvM^;QP?RS(1w%ePR_+RaPut|bfa`4s zjb>nzBRh%>oZS}4prza7Ov{&Zi6U>^L*nIFey{5R5ZOdin5eD+#qh?Nn9o~upR0}M zwX#~H_?7~%%Vcl9Pu(m`4XhreGxzN^CDi^(oKlZlT4T>3%p{n6a-%fXmGTYbyE=zw z$afTSVHN_}_WeCRWdzwRN;eiQSD@2oJo-I)yi77c$x}qvordj1CIxX;v>`Sh1rEYr zA}6!HzI_adDY10-aEOVLo}r=%E72mX&h~SCK2JI%^vzw)?z#AZFWD8vpq)2BuVQge z2R8AANyQOvJX{>BB`Nu%iN^j3=^`oz>pK{zvG{a@I*z1RI7JEURcCKQ-IY~=KU#}- z+g{CW*baqt>M%qc&eN|f@1u!kWNi9b*bvunhIRZ&2o0_r1iz3GmHJW6o6kUSsT!@F zY_7x??k_s2b5(0>ESCFy2(@0%1AAArwo|^he`i6jSeh`j6rQ3hXxaK}dGp0KmQ=;d z`#;zMi6c1f)d!3WqDL9QYi2K(h3!TInDAsr;1T6M2W?FbQIPrM=j~L3m!0D7e=ZjC z+s=W}oN50>Ta&y$Uzi8{WTw3&jxJmP&O3-}!H{Ft#PjN2*-Lm9!X2Vvk-QT{J1*vo zgY&v$#`d{yLPU=kFej)4OwA)B@1yZ>Eop>3f#j*#Txl(kU8}$KgN$FPjc;gwHVauk z+x6MQ=bInI)dX=dh*bay5M3;ZK$r)7pjo<0oc2Mx+je>t`uufO$L{33-<(4!yB(S1Lbv1mRllEdB)?H8o0xTX0$2DYQpJGOfOOXc3WTYIam{3K9Fvicc z#QWtn888~Tggk%!J4kx5E0j4<5K%v_q7x^9$e&x{0tXL>sWn!dJMZiF^Nh^CZO03w z3-hP&wm3-IO^LSW_MGf9;DXCQ+BJ2vs+Stu@x@DilCRjDO-(c>pe)ilC2?&bOiAcT zy)(&Ax}dS0^_J`9{Bn8@!Wc}AP(2RkJda6b4R-75Z|{~6C>IK(JsJFTzwD5M-;&7X zd)qg&%K*AT@)auF2#9OUYJ?FB_g<51VXtTB#;6QYr56`v^Dtg)HjZk1Fl`DjZ8fNYMw5Q zux1{63ddsBZWE#0<;;k4?(Gr9UF->EN&mUu12iJ@22}AqJvOOc$c~BbLiaf2!07QX z;VuJ*gDib68(k^L3@%qKaYrFMI`8_pYAx-(7O0FG8mpkTIeL^VW*D&4mbZx3wfFmz z_i{LrS2&j_zB`a9yeFzgl{Sp%u(@7QKk zq=9g(c5OaLBS}sBX@5mLB_uy-CMl4z5PcG#VDI2++oGH zse){y_m7JS_O=YpAFXOQ{Af?c^d#{1K+mDv`U#9SCo4xrE5p=nA79wr392xiJw!W9-zs2tIWJUIlV=A*I7c#h6e?C$vgj`F zGdxtCR9tCamTn^%yV6rBXNtH^6lb4$>vJ(T8qNQwq@VUP5mT z%g}Fz*sX}b$#U-u+K3dl4^UO|><>?Ve2t?n)mDhj?QH+{kEoejM4SAKm9)&{>zziL z6}ds4{Wt!2LaZ)F2X8Jx4Z@bf8S=Z6m76;N#<2K+$j_a;v2v7FxabH77bzX!-(+9D z=TQt1G`&2?j3L`0=;S_$=!kX%372f(R3oWFIwO*+`5rFklcgX_QZ;7h{rN#APNJnq za`+WwfDkcfWpmK2y6lY9p`4YlrTk{K+UeFQuHE@oKQE`N7P|%FH}X@fc^)y;d+|Mz z<~Y_q$v$=Gz1N795Q7^KsqC<)t{obrrF;$UefFbZ8)jUyCpp^n-jkZ**Z1h9J~sF& z8U5&N3V~sd%SiQ^;ibbfT3R$8OQsx&*NbWTw{oGDd-#)=eh>kkgbo)9*xj6KyJo3v z%Ju~4KJN9I5vwMy+894?Zsg!?I&*Ap$_!c@W+eO#OmOgLo8r0C7fBJc>Wa0CVWHM~ z0?b0bRvCN%4%xf+EseRXlYCXDqC$Imu4oZw 
z2+Kvh|H7^9XgT+-4~uom7ZUL{V0XTKp18By35QxIpJP*>)_g!@%zm{I)OKEG#LGs# z+|mi>9C4BEq|Lv#?o@|sn6_#Ij54GZ(T1a+y6`-(aeKh8{O(YigTu`GnKPcDsp$6T z(V1d>#C&GhDG_6H^zlsfuS9oUUFE7#-euk{-hfh`oglHzM!I6;FYUB_;Ue#VXC3bY z3E3kFpJ{2L^HNx;@ED(4TVi0cTjL_SM4jJE)__V}nF40g6`6X-k!p^P zvj*9kWlx(;2P(g*u5uxD+ z)iLcZ!82LNnwOKDx2HIZKYx8?l#TczpZbva&Xxl58Psx0qIU1>;%j<EK9ih#j{FePk%Mn;aqDKJc2Yi!Wj_!G4sbX|m}R={_p*dbLG6?0 zk)B!?L;|_(SbU^JiyyR1eB=!pBd?gzX)4B)!=>2K8^!EM@6fOv?3T0-;2J{gG~es> zrYt!aC-}%GoOm3cRbiyhT5Y(Q@a?yD;NqwHqh*B=zKI?7mf)BnNDzv}+@U!5_@W`O zpW%1WPEcU_JoSEG4|ITOEUyW3lS&a_Va@LL0sIP4K(Oau?gx6Q`W4uLgeQ?5FZMgT zZmtg;DUHGXdeZOSBZEygeoJ?Mddz)qVhpT2DY-)(ZDHaSAxha4-MFCrIu#HV*Q=xel=vz->H<~$@^Nk{`11lYrS zJ>49=!A+TV)!^5a=?QhUJ` z+Bq)|+VR%ZU2Xix0P^lrn*$uhl{-=`?~twd7R)%(%f<>tV!DRu)vsC121{o5_3R2< z($59Yu$(q%)tbdW@;d$kUY}`P{a5uGO zm%Q8WPNcZ~JzAl;uDh4jXNF!@k#C+GXU|`YGN^E5po|I&;ZifFxwwX2 zRGW%C{Ra$CTP{Y8d$^*(BeQU=-teyzBNAo|G;lzEMIB7k((mrKAEHi|^_nPv35mExF0}^)w-wzzl??QIL^*Za0 z1;lZe9&rF0K==8~Y>&mEKGyPqX!pg!OC3afuMerF6X!QW%oQQTTiBK!^^^WRk{k#_ zE0BKJWTQ46*KIM2Ml7dw+6aC1_!M>Y67Lust{(Sl+9AcE?e^hPRgja4wCHF8ch@$d5E8h{cEZIpufw!Xr3no7o{x+R$dL3f%QNZ#AdMm^)7Z=}2DI`!G z!U-i$a^L`8g|_So<~ko@Q>6VK}<8#&i~MKt@JQ zXPkC@IK(I>gS`LveMmP;eI;9u@TbG8pK(5j=>xB0{0`#+)F!%lbWf2 zp>OaB`Dj}bjG>~7NrH+Y!Qrj6+c88k`&;M$hAhSV{Wy!dduXxjU8j&(X1p zsUK!6~o4-ArgEx3{{C51z zfFbjq6+HkwFS6h&$bm1iM&?Y~=UW>~vw+MgMCIdfa+@jtTOTf=mr ze7Q3ix^evI$kRvF9(wWH$$dIBV@e$)8?&FVQE?$=|JBDC4ImT!F@d826SwEa0=9dd z;>ap%$mPiDHxy{+k3O#|Bxf@)!8w5|`gj!(=jLJsr-|iH3sF>lB^vPohto534_?a` z9hL5)UUeA!mYp%rBrTj+d9XU$Cdoey>->gVKE>k-A?HG>p=%h7$Z2-vTpp;n9Zxor z0bP&7yzkBMu#Bj}N)J^oPeK-ach8C-sJ!&L1EGaD zL~9dXpJ^dtcUX7D=JA_T3%fL}TdXZw#=O1x zp|lZ})6k~FAkLc^z1~EEveC7f#@1pLo7S-}dzmm(bhti5obX0%{QHym6QmfuPHS|a zY`Rq2`EUpqiIk|FFf3zFnWD0h+*x?XS@tB$#_A0*X+REHvxRxc3H$Zqtp19aEQCJ{ z_VC0m%PHA$fONgd)TmqP$+zW|jd;wQmyoo{@KcX&akQh@phz0d+o?vH>L={g7pk|l zj*O@9SsxQlW$W0Pb{0_wd8ghiJFtJ77GZ*qZokCCS0ua)Ktgz7wi&nX9W-_{b!BI`)wj^o1xd9Y^kC zXk_Z#*zD?0-2aFYlP#sW2m{7Bd$ta=S-zN1rjbRy)zW|E@q^u2P1rkR!WUnxG8os; 
zE=pet>;)g0hr;Rls4P&$V?DRWRE2?Pk%!qsL%Af1-H*uzz?PRAtmsrEeRw9k@G+yW zCu(H>Y@gv1NW?DPO(|eVchXfdJwLZIw@GLPCv-FTnW?xY&

x6C^NjF2J3m(Je<2+;640kKUYK6*|wB&(X9( zc+|DKqnGYGn-Y=k43a*nxCCS)M}E6Kf&?_e75hTGK?9r&iu~$cxNmBx*g?gr?VgG| z?R6I7N2Xw>;&tY^+lpx&@PUhDA&$?Imy&45EjKQ)k$bvWV6|_5WYMKeWP{1%CSCW4 ztSfF1>V+yHl*X7f8S8gnN4<}Kd^4?W_kBVfj>A+vyjeA9+CRKTW56kR<0W|B^$9*( ziQUPL@pAcEUg)Qpe_M{Ab3uTFMF)8{kq4Ws|0MKObDcf6q-j7>c0#+09=`t~9V>TT ziK9)M_!afuRi3&3H6HvHtRj$=d*jBzp-t`$DBUSY8axZ}6f!Vkf_kyyU+=V_<~wvM zJ@V-)KfV?zO+@YeN6xk^*6d%>kQx5R@bvfbqJ`JPtBwWKx=i572gWrhEN9M|5m|6< zfe{a1?>~}USMlVvH1vR-6V68E6Lffe_K!u(2$|aEOXQ4Bo{s5gbb3czH-nnhv$*JC zrRP9fPpS4iG!*#yWxUsyH-BgVGP$_q%+}Bk7=dqfMLaD?8B`(IbAkJQqIA&)A{UN* zP)i5Xq8w>Ozca4>xU?X4IZJQ4h>1HG7~H674O(v>`~?OCl-Be9+{QD!zV?I(gK(4JrnGM~!oqn#>C9HY+jj3SPxGIhRdepA z!@5s#@vVBym4xTgQG+yo_Mu^*;z`+U2XD7%^;P3u8r-JJ7phXh;^T_<|@zai#{t7P!GBPrsG4j3o zkujKVu`{}jS6rFay&ZZwZE@BWmSa>RugwHWE_XdoyKx|ug113(7r=YRI6%hl`+c_2c#nI@UJ!7Wl0+%w~$f1+I z8dOSEm$i`zSN#@30CnVMHQPw@xXwZ0`*GytvsrcxqruN{b@SI2wxiHT+p zbjV9NZ}p1|Ax4I_1qmfo!E)HDHb&nkx6U;zYb%IBucR1K2-gX2x+7wpi~H+nTi!a6 zDAPoVChYX>xQoOrtDR>mVGM+QMm%rNr=(`;mCLJ&T&}juS3=xYuO2nkkQ_27XNK-4 z1dO|=%#U{|%cC|X;@@4k8RP0foMjryYZqS6>DN_2R(Q28$I1D~gJ<7-OW*T-|LxFG zMxuhZSaV`mpqG86IVuXYD)HEE47hV@sGHf3dI3I)Y~Z;=H@!eW|F~^cXGE16PhS;{m`g=(WIDwhppCA3GZ5GVQ6qY$d_5iNO z-${h6oR@xhZP#(#?Jg&cIu!FED=W{Vr2VwHX4Xfmzea9%y@g_uv7jw0OV?q9l>r@x zuqW!OMc3w_tpEs(_BTb4%0Ezru-k3u79@kX!Pt$uJQ&2t;g(<_w84QFhMj+;yU7S_ z0$g8#wi5eHVZ%;ygXX6LwInxGoCamwfx9mJw20!8%_gc0Aar9D+}PV-v2!0?n*rm%1-)iTh>$wOEgI^p_s)%K0v-^3|eLr3i9n@0hAS-&T(lEmk6oz zxfUpw$Z({a?<28r+nFILz|q~|LiEQI`%mFVsjy$c+Y0q+lhR)ANHe`W$8z-ab_Y^q zW(OwT@u1rnX|f<9Zsq25H~JiJ&)-*XYLj^87Ve80Q4#npcgv5(O4o~kc}2yU5dDnc_k89x{bIGKX@m8>EUf(TPTq z&U-F-rE1t*43Sl6CBn-)f8LK6_pd9X;Qh|NR63>3KcB+;WxE~(=d0gqW6dcbIGsYO zSqRr@;Bq3$61(5-C@>OJwSK?;#KFaHc~nf$`eHWUJPS|W^lf$igLXsW;CHq;A4et6 z)}MmrdwfzPIp7pS-wRtl6?Fg`IY-H%DUsX?^#Wc*FG5p^PdluQ_phUrHE1tHV5i3| z;xB9M{WFQP$X62Sv#cLETYwd|A5EF7&QSgU%;@mX0nW{ssDSqFSA|BIO#FsI%O1ab z2lr73Mv2RPs2Y*%BNbG3t+t!M<(YBWFo$R8?;%`=Nl6MZ?pC|>+?t&>-Cx6d4_nEj 
zQ~|X<1TI8Wbr_*S;xC5LCvv0E@{dE)5=8N-q#KJ?qYE+a0EB=S>Ozb9V1)Vi;{RevZi5ed`s!s@LX^s&fg<`lzt)N6*Ni&DW*;(F$ZNud2cS8VSK z{j3`dYAs~yll^_B`OoE_%Tz}qJ_AX?Q$<=o9h2Hm%!RBvO}i8vplm*tOk&2zM&$9~ zvxR~gl>X0(tIBU-Wbez-{QEPxa6a~w@}##k5WU^l6AZM*pFHZ+K)|L@AlXIT9fxNx zya=W+u`MT;d;JXIl#L>4PaUQr+y5T^%FPyU(@cxtCSLuzmquA;R*=(i1xnQ4J8h^V!!bViOwO%^Vq)U( zNri>js_ND6G+%_pV8NSJLYp*DaTQQeNA|3_p&z{%riEk;(Z>+POtZ z6;vZr<;Lc)UKjr>k#wRzes#YnMX-%i9wf%A;da`6ZzCwd!mazxd7E;sG3&u&!@-$y zt;q~QW6NUlLs^3+ielqs3EM-mQu7Ug?S^s=(dr>VEuk9NdcA_$sO7V!|y6#Wdq- z8|`49$V+&}-tHy{G#|nrpO7BmZa{hA4proL9YC5hv0$?Jc>TyDV&*i2nBf=eEQTZQ z?&Elh6p!8qvs8Pj%4LZ=c7*P0OtW!Y21rvC%Eor$5kd{Lo*m?hsv^*arEhkmZWB5DLa#kQM|L02lTXp$IdLS*= zVT=c5zJDv>m(D@y_el$g|CW9~4CwdOVsyp+XOYqWzp}`yKp(goAD{F$p7ndIF-D%z zL*rNZS)+f4F_O=8K*fHfE_T?zjidKi0x}@>|C5CK*C@z`E^H9{BHB!>e<$|QQCLOh zCxL$(_^54y1;qY>Qhes$SPy9uRP6g@kNjJ31$=@{DvDiUcuOC~#-C*H@;!`a03{=mJd3 z)Z4r~pJ*{PX7SFg!H@k0EUi-ZtSeX2J^z}dc$HedNYxBZ^Ge47(E=N(ci)JgaK_j* zpDj-U=s4p$Xn>LoRN#S6f8#P?;yO7vvSc34l>mnD_~czE)-TkyT-l+7&M+s5(>H`u z1Aj3rz#bh21W_8@n+^kP1*x&WfqY2e44?v!xs9C1pH`tJWg?x;#QY$`11JweRBf^n z)D;9hf!$5`^&lWG3?<}{-_*l+XRhqMH;gD={ycd8rhX~R5v5jt4W@n24x6l1UJ-vf zkbL_cSZYY3CZIlR)IiOf`v)MD{v0*ww9KuOgR)HcqG^y~s}VJ5wE>_M;bLL-G)zQW zedBsYTscj>2W!mi#SKK1TRv07V>zfgUlmwbF3-7p&kYV4O!s3GEx`oPr+;Z~Eig8! 
zVPecBY07BM)9MF>sVJNBe}Hht#`@ClwWNQ*!_xjKAyC; zR-|^2UyFEy;-G-ItQaQ?5$yufi$j$^U{s>%F-p%Wg8FWKKTJKhOyh?3;!P!mw&iSaREIPR2tC@dmN;#JxoJs#u`mSTN0F)6;(S+9n6!&HsnFt5#% z@+?|Zjp7)|)u*m%Y z3`J`~EhH?vwrR^QaDqR52R-}%~879i}JMua)^^~&l+#=wFoLv&M?5AtdVc6u ze+60!17rqfAO7Ol-hQmGcSdnrJW!~XAx$WJ!k)Ah;AO@fI`FO+x$QSquFP*|ffZr~ zvEehu5!qPntWCxkR`I6EeXLLf8L^6jF1tLD%2BjzF+Kg1lB`m=QVfp!5Iv%DcE@YQ zyKL?2p!Y!krS)>l@4=mOuWip|slH_XS;uA03FkCNz@W5{^5gNU+{;R}M(5Iz3LN>u zMG{mn`x4D$S{-$g@gK(K=j4|8T@za{I+uOlEZ^&*vKHCkH))bb?^6DQ!sL#@-O`Z3 zH6#kxa6vVGd+_+`7Bvir7?QU?EYtN{%aN%xn=;bMK91AK;EfZmv)(UEv3djMz!=WW zfD#2i3Z4o-Fg^fhmh%tyE7a-zp$2_F=(S-Dpf99ssNmZvvS{ws7Jm)+O}?&2b7jt; zHy=p(<fgx&>VlJm`eR1NV#-^mGm{t4qP$97`XhhB&K4&cSLmj*40Z(6F!|PJakzMxM`;D^4UpZSrb((BAH>)! zo&8GUUAY_(-!V=f#A~Y>7=1AKIIQpBtZ}1iU_((D?aTul>*q4{m7aR<&uog5jVlaI z-qo;W^rAajXx`On?<}4sr=NwK8V+VXI)-t5#R$cM$4N_WIT=E=1~U)JkDCrD z7gUqC2Zh9Oyq6_m7hAR5ND*Y^8FLdwrB`Q~XSjmsS>!3$U8$O&n16W=Xrjfv>d7X8 z$A%4X(d6d)`DO44o}9n-c)~qFH0$Lws=A&0V`5UW2z)7FPFCb!1qh%nDi^bXV{# z!3#Zz3shd=(k0wa`?$|zh4qnCsKU(){7=q>bdB#c4Y?xFm7qxVW~(2ZcV9LCP;r-v zw5+h_qWz*}>WK19%g7?}CjsU1$0{{!4C?^V)e||0@jLZz)^hf{U1yD*Ola@@pZuZ%8INVfVCHAvjvgqw~=~(HB8j{;a&tlWhp1C@vFVQFi zf<*tjXxSZl9u+jm=MuOW$mPPXb41&EqXm(O)f>Yf-JQz$p_$w=zA`tpDkZWyQA$-_tL+P4u;HaSUjwa4`cfxsVriqLe^IBs z|LYGQ!YD75a0=V#pN~5NPeLec`m3sn9KQ2o+k(>E7sdP21Ypc|rvv3)fZCm&bY<#% zg-aVw+&j+iK)#c>L%|-ghG=^qY13SHP$JO3(wA{xU~mdlfRN?z-+;p5J@``QZTI8L zwPH%+hYE%aE4u{xj1{6;0F?D$N@-oF;Q{dv{JF@p59G31)-Nq4Dwt*qW~4IIOR?gD zUa0QB#I$x-CwVYS>7HGr^^^G#nd|=jr6BLPYm?JsOx}R5nU~s;Tg?~SFb{LiW4jgv zT_2=Ejp7Q-lO9Z%Gk<$Be?LGVzd-qQ2c^59-oQlDJ*0NuldSo%?Y!$Mj|dOB5rW&) z6eR~Qw}NHlBtcO>{95Pf<8*G>=lQ%yyGRyX4OL~+F+}ee==7NJC^&&HW3Q;qK4P%-39f@x~*hl}` zlf+<=Mps-axmZ#)@KKN ze)Ihj(mLKVaEcn6I)eO_Jdwva?sqT|9kW~8Pr+sxhxt&Jgzl}zS?x2k{@vIze?I@) z9nZ2$6?1ahjeF1_fJnx{!kT7R~Hh| zoPZCy5?RQ|f*RIr*_Uy`yBN4soOApK?$hmxY8^ZIL7cPE;^(V5)}2k*w#{>zk^rJb zyj&d2vN_D5l1;eGmw`}G{1AGSPzG8*Qpd)W1*4Bli2@w(3;8W$n7bGR?HPd+${J1s z7$i{`G!u5y-h#GinT6)&Irb 
zTSis+w(H)4h;+xKg(*mPHzE^|4(V?fP3EebzgCwzvDO%UAQSNrGsDlKaIxQjxnpN{*79TX?G+- zlr)@MH*K@$C!kf6P{$m0IE0VO9^$`CQWq~xVQS=V@_vJOjZEaXxp4f= z8<<$-Unoe_iq39wm8W9T<*)FaFVNbdb8mK#b$JV910$>Xo?SsUD&>16>H^W$hNjX; zrwSCJ2kVi2L5G{IXBAXme^JBuItWuG#|G_(pgqw-$6bGn_ix=?TG!xgy%0kRaf(9% z@1SDI9=_VeX8qQPn66EuWk8*#u3p3wmV-~`aip%^ zFmLPe$7~VqZ*!x#1?^5JQqv`wlBf!$7t{lzX2!RFtt^~rS;e~F8tq4SlwcdQt3YdE zYHe31+tGYFq;F;#EqtF#*!e2w*g!QM|K^)_&L>=AFJ&K`6p;Q&oFs4j7I7{u^@s6N z&CWcZFm!o%p}2?AdY;$T26V?gjGA7q@~$XbMh9VW+F<~0jn`Y_=eR@=`V1c7s)q^! z9yP+YC&2;XQ3ZAmZ)@-LW0B*)2=Pd-FPeB6xceb&Z~5nNr(z6g-7pUm%d)P8?68z$ z1`4|7!Gejbc(r*$=dbb~V22TqKXqM` z@3xx!@QyJWMyun!;O85Xr4g6rmg0!D-}Vs=2ylIv@5v)uHR*X99!SaqHE+|oAL>;4 zcfe{Tj@Gs{%aI%*^bwgC?7T+P$_F$+jLp@99I1yD`D4g1vzEdjQ)|mhF2_{~Oli2= zue#;T-m&T!>;5h-TQkU_#orH_ z@(pHH`w97|Y=L;X0i`zl+46(|EbPI%vdOXxE{|4zmSxP$$7RSmauNqCgR1(>!y?#K zlR-AAF%-97bLz~0jWjpyglLBFY@hIr+*AEIJq9ZkUn29tRn0WhzBPX5?!QWShJoVr zX3H%e-ITKc?FN-Z`MkmSO(}w|KVH{NRry+797Ts+$gTk*jt@5(L9CAZ=fGdX?Tt!t zz=85XD~)za(i;O(d5S_Ugcd}|x<%)p!$=u*6h`u>LIp%M#r#G?J}Sphqh?}c!x@yW zxbk-ezfI?RE#1@zAB`DWfujR;djaM~0{T8P9ya-z{0Mp@ep}=d(jDseU;M*r7be97 z^Mu~5JTwUrNo~Cg&25>cs@24sYYoiMV+h};85-~Gh|_++Ie4&F)m)Xvlz7>>C6KI{ z-yIYtv&$9Z@8`9+<{_KKrn3F;yrHuxT)2VK-Qx#cc@ItC%|L&dpGQ?jkk@1#vapTO zlug9J`aB_LnrmYSu}0@M_g+;vxa}mQ+K=;1jcq;QC|}|Psl{`#mA1p3E@}Ef(PGW& zh=N-2hx|V6j+xqjGite2%>#81{RqzAg{P&a? 
zG2|XN>B8iE&c{kX1>5IK<=D0-+jsk$m5(dmVGM8UYEr^VgO%HsGBUoEeZ3O=Y_}cG zDXI4>%<;QVk!upxE;Z^wl|W^Sy9d)X#YU2egug^3D4HGm{*Y#m8&~>Ke1eT@Wqc@B z!Fi2E-KD^&+>d#1HfORY&D|JxpL&ByILqHTGHQ6r@O@ts649wtR|rAT$N$VKb=en%}gR zBDvX(G()IcMlvv&Rk+bGn#<*@<-+GZAl{KE^d)k)&GLxD2_g=WO^h&tSR7MxR>tA*;klFoo)DyG2?4`$VZuI+oO?0 zJ=&2xNq!l~d{O*Px`*(N=tjJ}9XAL_*v?IPzxs=9)z9NbN*ByXQYvJPHLLcf=CWsC zvW=8YVZ;>&Wl)=WtcD%w?Q7!*K_MLUUV*ThGpoZXoW2=OYz|D0VIK;*hcND*P{(am zD(ADF#^s=dvMm2|&v4E)p~|)5BT-OOEU50JcJSu8k|}==CUEu>_Rsr`i#Tzr0@6K` z@2pX9v&%7N6sgmP@G(nzTWaDY_T_l;zNNo*3lCj;)&g3BqdY^j*G}$i1_8Up-iw#W z*xq~0q}kKPFFyCaWotBDw+gQ)^S5=Bj0xwI*_GB~0b`9<(vaH1Ss6PDvpKdjv*ibK zoOUUD^|=uFz}=(=f#O)pmCx%5F8Y@lBJPt83dhc-Yz9)(N&bAegneFWqa4Csa|&~a zgfFd2U1{Z9vgupT?#o|>jXH%3s%MR`8qAl z$&YzQKNJH(!#S;Z9hQI4!yLwa`hHlCA&udl`CC9dXWYUV-LRp#JI4^iDEUs?EdEx) zdSWWsTRzp?yc9&@8bFedSZWD^wv>JO_w$u!n*_h=Q^Yw6*l1y9yv^Z{c1^6E{OiMG zONRVV%pj$BD}4KGrRFS(IZOC4tq;XupE8l?TytF|i41xQNEV_g=}PE*jc+Wu?v1&_ zyb}?m|9qw>S43L3C@B{`@p>>rGI8etFZ`Ow&z-im;(Kos6O}X?tj5TYPF(xEcnT(R zbASIa4|};(%-2OlVrNHMas7+AOW}O4JZf9xzJ7bR?+1nZ8&`zS)Pn;Bn`bCZY26oX z+Ux!>*bZd;@+gbWZlJLW=e)$rCy3me;Nhi-5Np(VgAYts)kC)Bc|RD^DYwaMP+hH4 zd*q>Tv?V7maTzjsij-(J9G`?QrB>pt!7nJ#hz)*WgQez%vrVoaTW^cj z-)dUuR0%y3UB6JjejJkQnRqS0D#8PPk(f^Jx#K(VmnHA(SoPI^w5ls@LKv1}kq`!dF3tf~1qzmLpuY8l|! 
zjD1pD^Ev9Je1D+-=Y_Wd&3#u|a0s~&<;}R?9eLJxx}y8fgPI|BG^#p3k8{r@e$w|Q zp363y((jGrjLu`}kM`d31hYw}kn~CuwJ=LZIWcKAD;&q=%E1Twtr1!sY2FdhbM;ZL z>3lNmLoxj&s`r0j_}>;M88Qn~hX@`*cdww6mAo9iRygD<6yH`|5>hlpC=S5xoP(wW>k-`-0h{<9+HqQ+yo$S=*g zua7!;9;;W?S8Goe@g(8s^eg1?pzHkv%~r~ZqfqNV8pH9MSD4$R)&yX(re*3cZw3(2+CSMX`}feo+KwGY)mx(17RV@4Vti zIysRcxHEYn|IVJ)GKoY7l0a{U$!9Yk03kkfuHE7fBb^lDRy$7d1djjmsb_*JYVEsI zEVM;d*JW_Qz|I%2e$^-gG4-}`{wF@Y4QDu~IwFXl&@)?w=j=<$t9)uldBeQ@Qo9S& zz6!LTzKC)HFL5fW7q*0!R%t1!l)+UOopIY1YB}sRjaWX2!8>N;;iE#2e$Zj>!Z0%L zQaM^rrJp=Z`8MJcxBk=|rXQfww$gX$VD_ca*56>Enq|CU|4FN}C%4PMoRZ^0Oe&AV zkjfK_Ni2iEis2Q?D(RhT)|;x}iOjx03iQp9zKpG_<(xhP?@{ex$zT86MI@v1z8L5+ zz)htaU8(`E*WGs1*L6mM+MGaygIU02fBY3@tp&N#qOI|^g48|E91IIcR^b#O8vr?Y(*6=l06EGaaok@8$PB% z_BZFe4H{3?L6S})^Wc;6wEasK%=DL6eblTITNgW(?B&czINjg4GIKS zvK?NuI?&nkGeer4Bjas2By#yCdg85nNHqGY4{V3J~X=ai--C zSWys3Xnreeqg6EfLcn(wvFC0Dj3-M6fnJyge82xp5x-IcFDVcK(LPpZtciy{2yLw;vFAACkl3}mlL4QMPx|$h0r?oblyIGK$U}bON3kgWR?TKOAIppTg z0nRjw6%Mc8sU-OW=^j`XN8To%uaXC$+{cjYlO2$FOitP82r5h(gzm@MBzHdfVb4yo z2*}v^EjA^zL|>0L%qGcEsgRg%Qm&dapG!>&)||oLq2btjwJ07Z&2Huc3;d_!ItAH7 zg{AYG_rrsvY4-`$9|)3Ze=_>qbsBV+fb~L0FRTw6A#!|?NnBFeM%rHv+sCK(Tbd9- zO$D<;{s;B0O706$8;|UWe>B|`4NkYMsH`ptDTw{rq160cFpz&E+q%(;_#FP)oRz&? 
zo+;^MQGz=$u%39wexk4Vd!K{tzSwBtIPPL2n6Q)Nt;MY-!5r}=G4)zYW?fc^nt#0y zcrl3XnQUK+(uI-@U8r|ls)Le}*HfKaO@dxLLm)8CV?^d`hwwhX;E%tO4=^8$|E9?>0ITmr|jy@dX$mGCCZ!e{v4<>+L{ z`9`5i)KvE%4MvWLSsfrLx)rUNP6OIbFJ8wF1pFyCA31KYWq17=LovV3@b}zs_OSJM zi3xxLE zb55>=tj&S%Y;)AB`rk#*^k#`OhEG1Pt7=i|H^@g`OaT)Lu_O)K^p!a+ zzWa0udW>0{qfRM221UjtEn1HhECw(t{wkT1-Xz+HV^_4_UMl>3-}c~(6whaC>r>#g zB0R77#N4ggZ;7Mqpl~cpLYBgp1j)=|ieyAG?_Dx}zUz~!(clzEGuS91KLcJrQ`V`B zK9ZuOh^Yqd+$wl9@dI;-AL5X zF8omP?p*M$c$sPvoGd_qJz(vCHaiFSC2V``|A)2k4rBindQMNAKkoUa&>GdNtk`d=b zB0L+t038~kikVYR27P})`Sg%bF!IiSntcq@V)y_1Nc!lV!j?Pm9L*xl5|Q?kr=a|4 zmSQMv#5q*_KGRlEZsKo+y;M0b4yqdsj?c20sFQ-E2*a~vm)1WshSCMl%U_OCX>pN* z$Bvj$+dAAEA5y$G{CNZ1%om-uzzr6{(r8f7fRb*>G)fUdpt&*B0whxzeL8k`djblI&57x zIvMvx6x}>Wb+bcb<)#04r7a`3wiR`LK7C?A0*;3XZHPagAMxcx=?I~DP>U)TXC=8s zBHDurlUzyHWN%`&yY zA)#$M7>=4&#j^jjP0_J7ruxbI@C)9`If9*aj8%#TTiOJpkzKkiKUo5by5A4bwYug@ z+y%amuKRy(-};YyD0pRToAQjG>{E34cxfkVyvC<~`~h}GxO>I4m{q)nZ1 z+fb2#Ghuj>M_{@S*&Tdqy!Z~%@tVdhy}(rkmnBiwr&E)us7ljMRLS8|5`U)lh6zA?8jZXkl9X3Z-jR>>XYisfq3#09Rovx1Zq5an&I1?{5AY5a@>vF)4 z@!Db-#BBn=hRb?H{K>FX#e@Mf$^L(ZPArC{$fUE9{?b%71g&;H>4 zxnr8>9K_Tman@{t8{8w=$ zT9<)uP%@WMJ*%->FL@nG6sluT*y6((UV*ZTofq56N&>dVGDF<$*T=sA!zzDTq9|2% zzKBY&-)CjqHUxj8@KJ|X`^(j?wN4GNkFK?sT&HSC$Xf-lT_CS@GdM!n;!<#yv?@|Y z20fBp76ajWbvhbSLd_@!9OXXUyMx-P&STOzYhHs&-55LE_*QzKFrovsyIsp{1GVwm zN+9*7@FGLstNdhkn$|DLVr(5FWS6}M6|VOERvsmcWf=fyxiR{yx&Bk)ims9e9cQnR z6N>sKZnJEYTb6`=bEfydFB1kHhWzTnS4S4E|MPFNN_# z{3(?E^wCmtUO_X;&FeH=q=nJ>snOtBm+bi*RWFtMFiPDa`^Au|Vnr!(VIJ$+ueR~;ukpB88gdUkUUAcL55O~(y z+nRj;A7mR?Ar*rSE5&$KpO)ezLokXPS6Gsd90T!D0t>NhC_*QT4%9;CQ4;JMVAjX6 z%U1PfDswqm3i!%vEa}nf8=4kC=0`zkC@I_j_-mNfyVI>9v(ws*I#;kM|K|(|!2W7m zl%repo|67=AF%B=+k5!fC$Nvo-UgtnR$7=ucUB~AFKAZI`4(x@lnPn{jFG@q1 z-HZt1jrs(tJ5@oC_-CoT{I6SLLjbo2Cg|OrUmO##ODKw z(uo1PI=N-|`@w|%7N-EI7x&wiXE5?}nZj*h+zTaDcd)^5pl+;+Cha&3?fmeL7StTV z_(fVy&)4I|2*kJZ4qske-Q)3=d)1*&5oPl?-61$D&hBLUkrHgg%fuB9f^!L%GSEA+>sp(YQgg(mK|MXAw8!4)Hpd==- 
z=(|^$wB>2_3d^F&cgT1-5sce6Phq=&mevsWNnnr$yLqz0C>^wxZ3Uh|6nXRuQ0ex& zBy_GyKfqY@JIyByZ~#t-0lmbEv%9s6mP?kzQl=LrywG&W{^+}mqZ<%(QGCp`Uo zk^8r_*<{P_K@iC;-GR3vvGx+{_iV4_&k(i^hO6{3f0iq7-ZY`@a!>b=pO81w7D-FU z=(}Nzpj~t2I?X-F+9E47Xo}F%XS2lk0Sd!8;!?-!^_g0}Mpx+*6!5twFgNFFH3ooJ zihHQ)pELzeTUJ!yMlEIgdGDqr+_%1)iaBEPTh}rW#d5@$+TK}kRLzS+E&{%}4s`(V z{)wtB!CP7;pK{oO`A#9<m4H6Dvw8&Fa$aDk0uT06$}$3hQz)+gyNpv-~~ zwU!@v5M;pDl){SEL5E-R{ZSThB^%#)LKXYA6W0fCQ>4&6UiZ<=O5x254A6Srk?#6) z50!}nPY{FmJ?51ipGa1Hgx1CdlPKVoofnC;cxm_h1swf*+$Vu>TGCkS3Snyso$9dV z@i_ktX!OvLgZz;M*IRzv`#cy$cmg!%u3?`?N@dg-yG!a`lJzYvpEf@3RD*bDTMvj> zHin|28H)JuLa^P7R4y6Eg>I@^PZ!`5LRe2l5eBC_ack3dwH*VFo(3{C`m6z@9d}*< zgMZ4-Kmg>9@Y?tsg;yE)QFgahzLO2Kz$p8>>_%%_f4{Je$WCA!%Uz@CB2W}P?O+@a zA8=CpN|RHxcT2WPs#UL*I6UYorgl6TO^Rz59Pr-^ed@zwNHs$emdH#f6kXndl%M}3 zI=%~qs-qU+#CdA(}cHIMFCbiyaK&^7jl!acly|AGl{ac*jxh) z2hIk)u0ai^u!%)@#CsOWjXv|wqZGDXh%7+#dpr!f3-x?9XjtcR2ymJQ_Bvm5*?6~2 zGSO7%b}%(P-(jhWp(We1(jO^V??)tO`ThP(m9vI5<@eHD^&i0k{m(N{!o(RXm!qsL z)*2%7@OI$>o0l3zq%T~Z@Id2{9Y_#LpUPL0f=A(PLg|}Y_DVv}3phnB=}=9_!7 zxyG`RXA`9H~XBuNhY)0dbt;&M|12 zOrj*pH4CPobg5Nr4c1Y$Mcu@iuMdvjo4iuix~1S*yUjOQ0^v2$Q257V+|$|_9H=+Z z!VkV`YrGKQD7C^`9NpHNy^KJ!gg(|R1F4XF(lFMpOSj?IjOmi-6(PO8PPxH28 zv7t{O%(4Clek+;Z=xue!PaE3Fh zspUuUqgh7Dll6XgrLUmvPy6%RtG)SG{VP1n3}QrzthP?b$JnoW#xBoHNKX#e#8~lf zPMXqo8n;zZo7UdeTL!nE^pgD*A>BUkAn&bO;i$98pIL6vo(R#nIWnPq?J)ZZw2-?v zRt6nZoez4DLcpL|+Ncey)!O=~KmOf&fi5h<8D>648NMaDLMkQm4v&384zQt`-`;Mlo+w^41BSLbu3u0Yp;X(1LUzL} zdo&aV#Ezw6F6XW?`H8G4>7Da6>GN%BwYam2X*(lFBh!-6KC7Adw=uPOUoF&YQScyy zeV6vkM2`BDqg|byaA*g|&RP!wYE%;Y(^AI~0|noUh6%z-Ga&)*G-EISylr_KUmMNu zY!vz`@>PD>_tx{ZXk3k#%h-f&G@acCs3uD<7XGO`l(L^loK-k6sP^mb=3j~Ozou9o zYBFTos==uAr!lQ4qZZIhYWnpfgpfP1mSO;GgzSs!vmc=Q+Y2|#&ebWvH?cd=v)s@leq!~ruPtl)$lLWuUz~q`*kW%o9iRu8GYMswH zcB1S}F;yQ&QOz^{k{7Qv5&7Jrum6&KjyS`6bSTb%_z|FLO`xE=v<+t#@@~PEJlycQ|TD=+a(<2ToGnwfiD<+ zw2+%%K-`2u6}La=Qd;NcY8N){c8yUkIW)|N2UU31-X&RXev&bp40T+s zto5z`983HRPIR_u3*zo2hCblMw4mw0GU{Tzh7kduZ!h;KdD3@JUQN 
z%X~}DzrK`NlUfgS`6_L8szA%?%M;bMo>&AD&w*W8RIGTmucTTxL9@B&g+T6wD(wDv zj|e}NSUm_XBE!rB2d~n!vT^Z5GNiW<4=B_3RsvsLwB&n&uNY3!`pLG^Q+r!1MCH}xzk=KMWn6XDVLZsyo%NC!MOwSu>IV{_CF?YplLyorf zH5I)V0@FJth2k_N&$6yG4 zG%v{{O@1Zk10Kz|k9zM@GEFpmRDA_%g?=iG39XYP7ke*RRT1Y&V+%Bvk5rJgxYHS(2h;b~`G9 zP^9&aOrc-TxWWp9T!ieUZ~t3RBJzfNMqnW|^2${A{yO5+XrL!sPSaooI9vvc{xPOW z49x!Hc{PJl4Ag%cAsDQDr|!fGtqlZamiH%qQx5h9B9E=c9b5~0AjnP6R&WxxAv$y) zoI)ckre8v(*c90ekK0CR4HC@ETHpHe2&a(u``a@K+VmpUEImli9<+5&g4`}jUsyuh z4kD9cbx~$5SOmYTpkA;1Xm$Xjo zje8lzRVMB5y|pvArI)orGtGArj}^{uXGG1jp_BRLZGso7@9OqNZTYZ}e5YbU`J~Ik zrb6M#G9!;_#!<~b@&a)7*nSZBTq~Cz;?aF}vYeK$&15_97|mf%B+Mzv0^VbwlxZ6y zG?TWeyu{3XYj%tYt!_C_fU#9;zq$GFTVQP%ulyMli>l_i4u)_5C0@_zQjIVk)S%Vk zj-VpiZuImCRm>A9K}RRNPW=A+#vG&Vnfp4oQ=gBbAfxlt4W5M<*B&s{%}7hLNy1?6 z9ewh&2Q^Z&d&_3H4Q#zh3l6kw+S-~@%0$8H!N436%-%X1PC4+ z>IxM^AG%D_pOJ)rphLSHA0F(Yu4H2zVN=uxIhBU*J=6{igLfjZ;O}Igr3)iicWSt> z46=!w%<<$=%7aNg5ee4B8t~>tit6@}_>j4=zSMlE1WM5qgy^hfF3IFkZ*+u@PvI5( z^0J}@KY>7&GscXsm-+1KV=MX`P3_}*B4>;cG=oL-qrGDfKAVXnzAr4}VfA5> z#vKKbzO!V@j4!diBZD{7G#6j`&ASIwld(ggJ8>vt{63|75B{tJp7oF7!1~e`pJBG1 zO3#t)<;iU`IS=*fX|&&J&1X#;oDAEAcd$*u?)1{K26)eKf2g!kPq}S5%_O$i-9=SO zdX-)e4y7HozNqP?=pClM^`AwaS~{9z^eKS3fkrhJTO~+{@&#HW%;b|5(c;4 zdQDa4Fbl}pUcNW~&-8=Vg%(B*A(y+G$!yv;;)bR`zk-49W>d)wlc>S`76JnHUeO0K z{C&wFkCnZHa_c>Zs^a-5HH|)F0CfdnRd;JAjL@U>jg`PBdV@Mk)ZR5`ShG&`vCLov%Cx;gmrtT(gJ8+Dtz`b#W3R;ieVN8#G$O4e~ zi$R**t}TnuAI-Tlp0)nzLlV4~(t;R%v8vf#ziRr!AwRF%fLjLt43*q~yegxk<1Zcr zUcYEq4}1Z#z^cd>#i$mO*4rz8+m8%}Ylbkst)qm#Pe(rj4S06=E+S8kD&(w@r+P#p z*0&GQ2Unotw!$A{?@N$OiDHw&A#SEe-_S(7!vBXOmWB8sFxXCAaZ|YV`RR7iM3DO&QcTi2epBRv+Wbh++%jpJjNtZPcf(U_&Z!e z6&=y6+DwiCDe4%|wRIiICqFYUW&rC z2QH+Hm5kv}i6Z&(c220d>MVoC{*;of4e?q^8JhwcCRXNUH4g~8BH+M`Wfc6#_c(9* z6jNf9PyAk(WMA8~uAqqsp1huDV%U~y^tq#?=J{$7Esj%he%4`^pc?C2a0GzUY`jah z_$Uqn3vr9QA&u?*G_|73EUtMEdkvhj`52^6Sd(&jaa0R{pX#$~NXl2;T)Un%BiFpH z^A2i7z9XLwnilYDy08=v44UAlorT!INP3f zVfHl^DO^{Dr?uWPr_R=w8*boVUbBWN_&3(-)^&~9vuqbPwuflGKP`rhCxnHYi0+K{oM%BCA5C 
z_oKe9$(jk`yc`6l;(1$15_Y3L(FMfLxUCOLk}I%)d!+B>p>9GmETC6$inr4)`;<$ z&ZCDc6%9j5oDZ}3rC+zk_F|u(KD=SvRb4s#5NJ0j2kU3D%@UHOC#ek`xCL0%3 zWIZg2)IROAIvo(jH$3%EK2Y@u&*vs<&mpC^ zt?=_i$xYwq-!TNvOVRh}QRLFAC*^pC%mr5Wp<_U2uJ>!!XW&x81*y_R#V&ywU8taq8<$U=zYAhR@Aztv)7$$+pRZcqRZ<*i>*`R3JeT5T`9ESdz&nU;b(8CdRra4=0P1ZT zh(P|Z0;wQm4~G{B%;qHUjQJ08Rmg^_`3sj%?jL!g_9{#76;#KhtHT;fs4lu1T98Fp zO@kECkw4nki;1NX9Oj;_LvV(3et4R%K=;(=UffGAPS?DD7OVh!=@+a%yz$diE!R>g zlw9#LOyf{TcFnod>v4uMOYWV`_vyJ6-U>1OSbUP-DT)^)E$lSGq)OvjSo;OxbxwOj5$r~Unk9%Cr0VFv4rhkf zGYqIczHHmwFX9IYy|{U6_|bW)1X~AZ#GjsIkTJiF%RB{uRk&iuSnr_V9glM@52$yB z^_|16$YG$u2aml3-bE_eC45d3x2dp?;^BgEoOD9*l2Ggh3w?-#c-zE||Fvy+2K zQ?f9$R-Jh4&@Bq2n$kP(9f!BioTfj=+GLC3n3?8N-h3t@6AOU&8_+;3b*2eDI++@O z)|x58FO+V?>rc?`*nX&H-BO#tcbmCO{hs9lqyFb^ z0zu3AZYv;h%uqc1S*2GK7*R@ zNJnbvi3KA*`56X4+Ey^z_$R=wgsCxu}c$u0tG>I{Ej7pJ??jXp(2{V zT3BC+IHBJ`4aKqq1D+wTmB(UN7%r}f8e&e)aJXeVGFLls1+men6+(;Ko7~CD&zlwM zlzM&%%0HXkv77jBq#F-tgn{>kY51f~;)k+2b4m@kzu;~>`tSG^v2kcX zI{|4s^F9`+i}+vhJu?&F&nhh@UN{qWy3>9mQdLXMR7mN>(UaV!y42(Q8Y>o~Uh7&F zowk@RI4Ambe(q$B`vA%^XEck*Y5iqP%r9$(LW+I;uND~Bg0ZEpUmx4}%E1PhDteni z>!0E(;DZ-&{O!9A2)1sDqcuF0Ug`mu3-l;W8N&nZd=FhlzLXRp8Qb>7OI|eoPG7wM z>x5=1e=i!oEaJRtk@56pY~kEvOs&H(9>fa36{eIs#FssYhWC;Q7^VpNdJw@+E91H{ z9(hpz2A*PF=k)-0qUZ!?l8bOIGeQWYCC}l3nXpj@O|525;WgsATZ!bgdW#6cXVu46 z2I>sKUh5?UKHFNiCcl8S0jKDDCJnK@=RPdpGK*)Q3P@?u@Kh ziDt4p4)NUTmRuIAAxaF8! 
zXU25m2Zc3reqjuMGloBp-vr< zd*@#2>D$t+I@h=^tt^4wrlufnp^98nM4n`a*WEd9@N$F}NLnR-d^h+U%GXxe(9k7Z z*c<7Z+D1y^9(yt>x*93*yVLv6s8?IkQb{2T^kVxH)3aES-g}qtAO6UeyqMwTuLfD4 z?p6I_DX)u1Aa~xh>&nd{b53OGh=PepOE{)rv~*Hc%&0T;Y;g}Vz>ERXh^-S|(Rdl( z_+?(MrpH;?64F|kuqhRf7LwxFlm0JpjSMVtRV&0Qlh90m0OKFs0skCngkD%vNn`Dob9T#Bp#yV)t(7t0{yl7Xij!2jj~>9FV0w zd>u&T%O^j?+tdcJbkr?3I_GSm8E^9xZP^whN~ zpP*K&`x@4A48O)R-$jQvLddzsRNPMAv|L`V2;HPDM+!Ywo9OfU2#_S^LM1UpY895a zJGOMtmlaus@C6VAQU`kMpLiE^Ty-c`KUk4am-}bNJH2emllq!GRgcTY;jAmRb)#!| zom&40NPn4Zs?QP22(!Wp7MHo=DtRjyL2P+Nc-MK)f)N`KVPj9c=wz{m~=)-;$6BNOT8--24e&?nsc-^dXlROb()i)UR3 z6BQ6JH(-}=YVPV`M@lA&(1n)IQ7mXf;h3-aHNo zE0&6RC9cmW#}>6^_-pwBWMeHE(%4nEo<>=wx#KC~BI0qu_J|Sr8@Ico$#w?>1%Q@; zrJuFtIpSm&<<8;;L^)KQX=_82V1Wv?axj5J>#egrENDua^m;6H*u{Ns?k_E-JU#yut zTb58youbpv+IrXRnUhg%AZg*na1aS@h;Xym^S;qN11YpN=9vU!_|3E_a%3l|x9w}y zPPQ)srA5aQWSfE!6G|1*{Lhum5-6<{7$#xhb~=04>9-0yOjTF_s{0Wb&wGa99qd`2mz?{ zO3pxHEY%TP4UH(4;Kwxp&`i92FKY1!CCSKo<`E${Dcj!b6CV+@^%GX1{jEKe&_Q{I zOHG25w+VN2sJL}yA`63< zmLBsMH@B;ywnsE&V_1Bdx05+yYKw1x;bgDF1Ni40u7e^6j8(*N1`d3~O-pn*41YP71TS zXxO#Nf30?4kS5N8Uie_VUX@0{1PMRI7~dRkR1$mi=^J^*GI=)kr7{nYNrmj%-MDA| zitCNHB`!<|FCvs>b+K*X=U)J#hI;0XNNyaTnP+m0pOBL(pYY3$0=(sT|J=b}OVKl8 z+Z{>4feV?ZEeH8&CGr5UwNxc~-TOVDR@;X(F{^6_*;D9N4P;fddIWqZ#!O6lcNqly zt58YztUuaLaA8l5G3f(UuSidjDn*@nJ|Qo?=`TY31-DPMkJ1ppdC&*sJd@FnY#b%m zIL3PwNOqQ}c@Ocl`>z{E5Wnvu%zcYX#r2dXe#by9eO&Q>u=kfyQMPaVF0Oz`N=i3U z0+NE#jnaseNVkB1N(loZCEeYf(j^St1JaFvfOHHD-T!Nz@7})`|2O(-zu0T9#ahpL zT(f3wt~u|UE6(F{e0~aXnoSES$sKGxlNPQny|SY?DKE>uGWo;ehLbB8R3_;g>Bu^w z25V1qQ6-1p$JCw@U2t2$bioXsyQSm>|e-?kW^4drm86aQu#7;NN|em zA@#qgE69XF9d)On{fo)qn?wk>3ZBQ{!TtANklPv5QK%>;@jp~PzBd`T3X|8L{zW`u zv;*ph!kzG66bW=FJm4z23O4>l1JOE-M7T~Ksr^IcOQ4y6s|Zpb`6r`E1rey`otGK^ zQ2AdlI>9~lFgwfi??X(7VhXGdlhe@>e}A}u`{Ek`cr44w_Wb?6mO$%P0VPzy;{7id z@&8xIN}}FK8NduOrXC4eYTQ2g?A?ZOwWZwxLnd_Z5QMwRfD^4I+wx>5CwsI8PoG=R z|6bX+c)i!uW$B2uk$!t+=^XC<%hd7nYM|Rx>CXB4|KJg#LzSQxf+&ys8xE3>8XI%q z0|QFkf-U2(6qPS*p2Qt&Cy9X;WiP&)nC7ua$m?4bc_Fqq2JS_SE3mQ|^aRl?7TN}G 
z=FVfvg699l-~)i-)90T^WON2eD(u{xws_To^L=M5o6+}s^AX3OrI_)uTd+A&9}p)q zAiF>58oXwR2KWUmL8!fFEiyxz(zd)Uhlfnp1G$yGgFFR`DMGU46l|9NoyA?zgRM=- z2|!(~%nl}emeJJIpx`sviT|*Yzv!2w$d{*ud~ybTQ)c`jAVz-97VMmdh+74PBj&vj z(diruM&Kl+FYUO*a-ta*w+HpXvy6=aBoQ%WaJ{8HwUCW1(@I2#2ih#%X8KstQ?8MX zB1p8DI3EzT9XmYXZUAf-1x=Yxn)SWXPXeTW0~=)D1HcyS0BS|K7IZ)lEvw9&1H6FO z{nrXjyy#ITUaC`R_%!q=my2_j&ev(HFrQ>-lku&ey6SIaytOk(UNh%Ud+Gj%P1Q}| z1SIn{7aRh7-w!6y488-)MSw_DfnZzKOF*hbeFstJGf1eOAtWC9hv{WUbxp-%)FIVU z&~rG0T++DA^bU=Aih&b`U>BrbGbrr-uI22{7~^orj;YNG8eA153eF&N@MFA?bbHXr zDU#oCoANI78|k=*lnP+`JX3fPvRvk|${wBRZ0-f}pe?~C=2s#AWd^L3wM8tE)&}sA z#kK%r=2c+s3qR?kbIY>RwXi=*g?q4EA^{Y2nkytjgtRCwyXSFa=!a&9c?r);Q6Fw=^ zUN9nC!Ycu0bts!U$fpG~!hqZt=tuI{JD~aucPfM?qwbimGR41*()T=H^?N`l`t|tI z$Y6cA;G&MnbzVRV#b;dGG_-)mK=IBsB6;y@4QD>EFPrALAWPM#WzKdJPRI<|N5WEe zUlXR?KR-1yexCCoZzMGe2pt67cWn%lg+$B3X+I9|_+_QVo86KRxytQcCh=M6AJ#PTWCS3-5k4i%_OKeSLZR`Pz?Q({ej1$ZPu z_n$Tf8r;*MjTgIP7kW2~lw9a;%z7DE)+pXH+VVy=^E)HmvdFnHz&zlAcT&QE`)hVs z%fM|lt7FjU>ux!U%>De-*DT80ISmdMBDWwLvVyzRxL&bOZ}DQ%7HAw^{ztBWAqM1%Hgx&n z40sB7vb99}?g@Ek6ZnrnPG<5&UI-6SeS-Piwv5G!+DSv7*n>_o?u}(Zx44t#5B;2^ zqGdiEEZkOU8?WMc8Z43ENk~DVV8^s~_O}Z^;I5^nHNSWTZb3;*a-xf?4a~H&PL^(! 
zZ5?bBa64K$4@=(vd6D-@sIYWXn${JdRqRv>ChANOb=Ktqx=pz|_vvOskboU8-30_B zcf6erNASw2KcM#C5Hz8cD^UwWY+H4`UGcwY5?ytD6JSWpo(0`VvA#`P+f z7aLTmgvkGtm8RY`Qk@YBXpu=@i}MD5Kj8{1a$5K*ADsA9i}3Gqpy;B_$?T!qhh z>E|FTUEdid2yT<300$B2cElAxoX4>;WY$bk=_)R?1b_tFgCf_~3ARGePVN$yhT@9I z{uTF4f(Rw#6OsWjAXE8~roy0LhoA}-h0c-ZnmY}10QoC?S6ilezdMnOheuR%K`5wVQzTnt4v3v62W7#=_;d1rZJG;a$Hk^>~8(( zf_uJ7_|+Nmu5Tyro*g1PC+oWct?wCD;-=qfDuW|oYvTStym%omzVyr%Ng9DJz;bd;{MnD({R{A4LS*^&gA8*PT#ca9*1&!r1 zv+0n#8>qwa+*S7(&!9a=jAyp!KL0ScTG4j)2XBgu`K%Kj&M zA@vq9BvQ%o0t9+5-BC7rfXd~<8F`}zkAyrEeC3~1;fiF}V1o_~1`{p2Guwj)_+-`1 z6sjD@K#4!WF24PADx};tpoF(7lgX~0!!$;Fer9E|> zl%b{vmZ^Fvo_}C|Q5vNQ1s=!04y&L7w5wWf-&MlGT9J+KdBLma9-?ux7=IK36In!- zFst@{#=6yY*n7HgS1K>YaEDX2H%Tam7eC>JsKVM=Qu>NV1NAcVP3z1AJi+jPigum*imn!!G_ z9`?LyaD8k)=Gav1dR=MnjBnec^lg&c8eh8Z!tMX;3!XZV6!w6%l;l6Cz(o+^E@twM z`AQ2>_|4MD1TL>j0{cH21eK1_*+t*Tp)~GosOK+PI+rZAL;8{e)e!xn!3~(0pjNDP ze7FUao%fu^jb5-be*I8oujKH8#{*O+A#M@E&_R%X??cc_{aA6qLmNB_^v;D{a1w}grjGhTEzHlV1KZ)J z$So|lw3D(NQSVp$F5VtWJ`u8$sj*B3b zdw0hOG%SwU&PQcrLDRiS^GkdG5)W7u3SsJQ3=~kKPuQcoY8bCU{`_~dR?pak<*)u~ zR%F$eYG`W<%zOae;>v>QH9o1yjUH(F;o^ZHFLa-yS{GG=y#-yXeCOljp+3Lg?DaJf z$P=p4^2A7-_j*U!4@CGO3RRi`2N!Q`pE@oit+nUGBb}Yl{YK2Ad5-|-COWLvF1E|K zM&qs}bGa7sT+Q^DSZP9EHjBw1Mr%blBq(b3AEl^cERrs=xQQL(_AIx#D2sf=yPn?6 zq-JbB-^fe+W#ecyv?2W4<06(9uBPQrO@1pfY6Lt-oMrYVUzj=1ZVn6OJk(aj6xc`O zKhfkU*+dJ@#-*E^$wOB{UqkH`Jd$3ox{Z?LlWl!XsNj9*IpN4oRFp>9*aBP(Jg^4A zlLru*%Vp9Q9Z&6Zey+X@>x?m-A<Re0b0uaI=bY@`zx3Y&R%&H9PPEb3|PwkE{w z#y4&J;YFU2Y}J2$X{38PX-~VLXk7f-5Z=)lh?sZ^L5%2_pH~MdN$mK-P({{hl5=m!qvJ*jVNag9Hb>x^#b30vp5q^#9Z@bSu zZdRl6HAg|*ie2WX)Q{4hzh${Tt2>H@atQTTT$0}-(dd4YQY<&ALxp&%z9y39Zz@*Z z?BUWF_iPorVd%%ggNzbHHfMc!J@>o&e(tQu>3QFyhwq#5t=vLsH@gF@4nVy4a&Jf= zl_09(K!j|{7DYBI9f!vpI6VM=5GGSj*U~(MauSQ|S<fzWqb(nX$%lgIh>eOr0V;wRviZFj!8A{=&|=U;-&1ug$W zcFC+ht~*O+S^MYm_49Pqd@bLXGgQ$Y*U7r8!b=C=BUwz$Bm72y>ETqssZ7*mv~KMf z3$CN9NqXynm21-Z5GJ}+l<-G`Gv*xao6u^A%^ur8OC#ga`X9uGbVsXz8gGEz}{m4Sok9*XXTCE;CLlXj~`rX 
z54*;tNi1v?6Ec^GSF7}sc?sR%lOD!r2q&vQvrV49U#NpYivT0minaooq|wr_Tg9~w z6&u-(hpi%`um!(qXIyAV$b3MX3G&FR-Cv*8&NaZ;w}mI+&0CW+wYTYIW!eT+VA|v> zIZ(Z{G)B0R-bG)F8+%V5fggD~_Np`O{-wib9}nC_avZvf?On1(+iA73;*@WAi^9gf z-A@#eN%FmE9kmLX3JK8+3#?tj{12b;20iBR`u^58mJJdb4ALnhYn;V?El(3DBtu7P zuW-^u{{vW>>0Lm8z);XTqR9pNw)Zz+CHHF0TDUuo;M2K~q8t{9q|g<8i7 zkBPj!tlQ}%JijDCR&(A3h&arw}BHN9Y z(5zo8m>%_26~QW*PcIR4Pu~>-H;09V1>sn{0EW5HwdhMhl%oTvWT3P6L_E*Dbaa01 zXuwIXU+UH&p_o&>K8y4xqU8e{2M@lmXX8Dc+xP|YO$9f{{MF*PX#VPk${p@QQ_n8E z*|p-%zr!`M*)g3oZr+^y5KC!e&;7rc6terC`JW~r`70oCUI(wK?snP0I$Nhg#?r1r ztyc%&G9A98I&V;exXIeQm#N|E;un5lf>WP)%#%cu-G7A3QOi+i$T+5{vl;B93HWnp zn6J}mO^Pynw1W`+e6gf z=1m_lR;tR#D({b>yUU_^&&zaBkr90Y!0cRefGh*_wc;aSFS;O zFzZ_nJ2BU>{i@C5guDG|iXzbJ)!mL4yOgw8iwXH((xZY-k4EF<6XKv;Lv8zQw2auH z_YT@4%to2;+=jTCC5d;Se2h3(LSL7JjCyFjJjiJuQz58eQyvxPx*AJ<)Y-3B&EV~m z%}7zqGkli3z2)okWOQS+%r4pvS#8Yz7tPOMG_DCB@8F^>Mmz%&xFELWjf6_E9P#y7 znBAd&E-ief`f3N3S$2uUUJB3+nP1W9T(m#TMNMHNEoj-HS`^d}{```>@=4RtC_*eG zJ%@df)jUJ{0gYoPfmnSOPVG7_MLf45?yJ+mpibW#qYHgg>C~4^L=-*GTa4PielR=C zD3hXaGxf6awID_vqT`AXx02CInSMo6C1+57GMRBe)_q*v_N}ff{lm|^!5o_M6V!EG zJ{IDSVGJRUE7a?KiHd`Z_w-YSB3+C~C?h9go6;Gc@T+}ZYYqt^-@C4vr4{oo5>Y=r zYN#yP@|-1jh8;zFo;?_Kq-ykbvp-o!U+zLHbaw0ZJ83w6BSmG=sO3nO>}nl3g|djg zIN`UG41goRUQvz=o=VXU;9PFBUVR$hcsE-NAiH>75q^%o#=oxIvavDE zvfo^M_O{09p)69{@s(nk;C-AphJ9!;`yA(C=5IH%^^WdH?L5C=Z=wfSdX0idwiPQr z;0v*$jN!@LFJ*rq+ts;+=C#S~u_DLQ;(27wLS|dOe2MKb0AZq@gI;%RyMRHO@t-l+Irar~JA~lD zmEi=hA1m)Wy#8j|(i{v)2IdoIwFCctKWx&$AM?UkIuJ^+_{AJQIN1QEmC}7Ob0SL2 zy~QPfaHeV6>%88*%-ky@Y$^CbzE)g1_2ZUNE%(s|Y{835Q{%(uvY+LK8J{RbkrhOm z+)z2N7`_zbOml5rR*>;FLuCJf9SDYTVdO44!Zi&fT$sJIh1?22wOZx~HcWbp6w{I0 zj^^n(?|eCV>YTB{@}f@AZa|REU^!ZVJ-gp!h@2%!nA+=Gf0-k93d~Q1WqAjXb(@s$ zD&M|G%09QeKukwTdq$wyNprF38|y({=*^deYZzWi^O$8qvDpbcO?xGlo+Kt#91H)J z%7z7Nmj{IE9E#BM<#$5kq`T7C#?I7CQfY~U;@)Qx4u#>p>uoQd-jjXa8!#P^1J!6G zqlX?at{vMRJiyg;s0s^6_Z%oarI7?PZ~f5-Ka?q*E@CI&bvNFgT1>{LI|~Wz@!LlB zevjrPA9XdpBoiB?Pe-QZm_s`Vp6a`vRRi)QxQdp1b82-nD&XpkxaM}BE>+U|CYGjK 
zXN}lsyt+Von>YRDazAyNRhv-FcfnR5(KYs1L21a*Ta8OL)qVlD4liLyTzX;sDFg~~ z8)sV#XAX`Od1z{$zM*1v+fKFL zaqn~p(F&Y)IgXF&2KO-kla7Y4 zhT(PH6)QbxprZgonXkz%-a>@}bV=}2R$+s^qqz2e^#J1<{h`3!{KlZ*U4XV<=ErM- z0vM5|s`ab5^GnFQ>E=+J7#1*iEa8)BUR+GC{Ia-q89f`|8#>CbStRnV1$OAomCH+> z`wF*MlTT#I7_C*md@`PO9@cCCgd+C5hy%w82yt2ntiAP19T8&&nX~oKCD@s=iYJ+xK8VT6>!Z+J`c{p&d#+;D zyPR^0j@9(8<15oshW?)5O(Ik4ejaymc-LxvXAs*t#|@dpj}}jPHb~XlM z^%aK9e=d-{v2B*qwO~yLglK@H~UrWsoL-(2@U-tNSGH`je4d(`!EdU zGAgE;j76T=*nt%!hha^jJLiLF=&^^9f+!ImvD?YgQBmQksgQ9$`ycxB?i(7z``sb4 zG0C-Z#7j?~np{}o*racwLdaD;+ctepNW#b_nIA2L%;NUj9EM@%XFXVP2m7G09NRIr zOHILt_Arj=o)nXY%F{79|HE1LwQBEOVXMZ{U_8~!d5qVqfv@uf?`iHS$Ne zaQacO{C8yWdY&tkrij~@-dE5PCYvU6^xkqWGS;Hu2$oHDvB(h4d zcZC&vBPc{DuKey9YIF^!lzt{WT0>6Yzxwls&-K7F$tq0KGdglA!aq7s@+ZI znsIeIZQgXP7f-(~hE4(Lsg5hxN}H{hR-W&oVl;0QHyf&2O7fC;oC*~vlD+?JGKkp! zTE^97lcTak+DI4{79?a76A9DrL)+NRdl^!YG7{WeegzXl`wx)+bO~DW{!Uy&>7NcX zC|aW$9x*MTl4L_-jp%CaW9@7k^c&L&l$BbvVzsZ{7-V*Y`tfaXuKpmGHd>#3im8K3i*ap}MK zFqefRINC!{7H+I?mR`nC`27x*=lGmNXy1#lAw*J1!9wwoWid5koSdSGC&M}a^~wkL z@KaPmHSSy$b%tj>y4GlC)iNAfxi=XwU8g8)LfBciT}1pQ=CZp(Sp49@g2bx4NC13& zE*`{Ss}Pmj^S%D+D6f6ee5LT*>3~W+No6}((hT}4mZd!bH=Dy$h|@;H6eIpf5o|y! 
z-cHTZRntN!TNdkHS>cn7M?8ITF|1WrU9uH@c4H5lc~z*X+*%iCGFX1I2Yl-lN~9J& z-0IghkE&BGkVj8rS80c$dy_q2oxCyUC-kXQzNOWcDi1H`hP*T4-k9Xk|GxGX0~}q~ ziC1ig+EaI`kfcjHcZLA&nCeqmpMbEyr(e|W_@h_~FLo!_2lM_+qYNfJIQnQY*%eyI ziwt)**`mUZ|9+ZRcdz>S;P|`D59lkg^T|M;w=be7z7<+1rj$e`UN0cP5WKQr8fP6$ zW^CkmG&YxuqkEI;0mW!kfvOYxI4Ye9d|&Bz*~sOzrU>$|!_4D=}L69Ad`zAMNkzB}t^-$dWPKF553u$j(n zmTosj3tqk3Y#l8m9jD(DHzE>~>1h^no7d2t`?9$*tQS;(k(Cp+wQ(2~70}GJX32cR z+_dVpXfICA3gE84&sY^(T;cHC!dl(@N_Ad1WCgjROZ3{#6F$nGDG*e;iXp79ez+@y z;1`ibZBCsMP-$)c3E(~UOW9zO*iG7x8x>?uY>J#Xc9)E5u`E{_voYqA>7|rUtp`7P zjQLEKiG|IkD;gziZl|wZO0pKVH zy&wd6K{~&~Jz52%3bEyyy_&(%pS#4=nD|zY@QI~DM!Cb3B8iAJWU7}eu~Gyyu#=)5 zJC&V2E(HU8idf*bcw{<@vl8(RNM>GjO|8!Rg4j3hr*uwoyiqlL<~Ll zYO0$%-)_L^vZVqeQQ3=A4eX24lkL12h^Ag1XN5~U>%(oF%<4KDD1E=Ui`s&wBi0a4 zds|vKqDLne05=B|G4j#XGHpf&2mU2b+*m!<_v?_9%>t#CMi7ViYKFGXxZCu{@0uaT zT>^lu+3!j)+w2w8W(mQm+=euKG>qmQg&DzQLKHIp$J4+=|Kn+B*~Lw#Lbr8DSJ@Rr z6j`QoI%6(#0^%icbe(C<>jk$o(CPJ8Mj!oO%%v7dq=jZTahXSGD2FkH=!&e)5^;Z= z*3#seCb_jkLb-@6wJ7E?uDes(%vwyV@b5FT<$P{tYLn%DESTrZk0^0zVWB?7=S=(DD&0hpz+F zM)onCd%4k8Zn5r=6!0K%^#tOu=i)-GAzqp79Ok{to_u?)&ueKg(0PQ)a&psazkFJU zaN0*0{NW7gVV(SGWxS5Ln1gMPoxM3Jv=BG+ZOCmsH6Gf;8P05-2zJ5xq*;d!Tr)H% zwo_c;c5-fX3w-wA6!W7Fw^aXVD5tgEa=@7ZINW!x|8-#vZ4dg1kg>YMYM&NGjVRN+ z**C@-i%9{=mQ0EZb2ty3A)ty7+(S6qLhN3gO&+3C%=Xpb>7vhUJl7Q=d&MdGn_N$O zjH`tDaUuX4U@x21^B~8IQ3sz-8yEH02n?s30)yx0^7d+6l`kfR@2E|>I+M5n>6k4Z zh{aP~(9zHo)Jdq#6|08tfmrry!;?YOt+1@C4piSx2`LO;e8QwQ34F;Qe4MYH=-ue( z_g)9x8~KujPS7SEK}IPjW9w@gg8L8+7Yj>A?Ji}(3-w_P6beO|FkAxcTgckJ!J_a? 
zlig`8{PO4XMHl$q;NYNGqYG@V+j&atN1Rt}b(c>>LGeU#g!hrYTi+=WR8{PN=2GnG zQ{ODrL!JrqyL+2YZ{ORF3Hb9B8`;wrxZxotUmRwBPmpIjPM64b`8peG-49j2v!LtN zM>&vV+6cXxoucu6FJ}1e>quG6rR=r2`%|qwElk%R(hUnavFdB*C2wkqPHYcxp6YVV zr9l`wD2s!|4l!NZI1hPniVg`)HKKc3sl!pz$8SGVT!@9BB^_&bxTH zfBnvg?|U@uQqIjmq2tJf)9T?FBe!oZPErSjo=}Zq+LIe6krGy{J@`skf=m9)m5F5P zB#5KMA9pDAF4el@VoHBFY-^h{{_5s%NL;)dE_2`j6d_`^xV5S44|TbP5WS2x=7hf^7zp~Sj6STk;{zse`Ow=hUwp&i{UXc!Z>Jf)@RhgGPa zB|jBeggow;iyzx@Z+Jm>+*>74`$1?`T`sb#Mi>&H_Jq=2Vd^g#TjbE-%G=5D@{ewUzx|E{*0W?POg-|b=U%9rwb8H2c=-%ds_;eRY7c`)K3Dd8*6V|-YD-I-b6x%Z@!Z%`*zC#px6V>= zoOFZVVZ7;TZYAE`?mN;OZDdS2s()6fR#Xi`<3EoWHlz*fyUc3}$Mf#LUp^hGlEvKe zdw3N6-IxCOczewS?x@%jPKP5+!Y6YO=lV-8jUbvd?Vx!nS6NCIJ~bTkyX2b1y5*;c zEJ<#x`4HK}g8MA*nkux;_3Crkov_?)(%r?-!Gf=#G_!2=)-EX)u)771+(RC2mT8sf zsJk0J+mklebhLlG;w6{r)^c7sB&Z*q&|FlIZt4~3=#!>liI&7x2@|88eQ!g124NtQ z-2BReyOe25(_8V0W^sl(bfJp-i+}dDF%HZXPZ!6IX+`BiS$Hr(TzSh&gJG6N0A+IF z52#pYuIwMplUl26l~?unKB^&!NO4X+opb9%R4(~K6{)=~~Ri(=tE%qdxU zpu15Sf%%mpF79raTP--u^FNmxYg3vg<;{G%8^_nBl)!7QU?+SXrEJyy^&YmAK;cWg ziCQG35z&S7z_dH0a4sJ6vK39XFYFMTT$_F60?}hq|VMnc69_;zg<2P=c4x5 zZ==`X?UH6m_xPX#v|F5dBp2*vu!33Vh4_*KTx62*u9`(4H8MwS*7<@-1L~{zKj8kVJz@s->f#-NWb zIDl8uDMKWkH&eYAUDa`_M7K(cIc7gVIxhxGH=pR#oM@ceW$#zRN}}2$v$#vA%h6CJ zR@%z_dz*U^xWgS$Op22~*6!Y?!Y|Oa@S|A_4i#K@X6Dm0dv5MJg=M{@IJw~udRdC& z_^Qw`bfw?S4PK67hHIsn=W$>q4s#pu>b0;_edwjH>36w!V6N~Z0uxI*4_Xo%d&;E2 zd*3rVF6mnCO1R(N0pwSG%eNrPb^twglb+aGX%LlU0Zyex|E5^pH1c>FiA(pR%-2&3sfy4$xmIj z=QH-JuR}l_rvEXckVpC|58Ts^=QM@(RN}5n0kL=OlI{R&|y_d6L~H=Ubm}MNTMQseI#Kx z#j#qkLgkOJOmUR6+lkbEaD8_wLsjiv>X&PQbLugd{6ee~(8i_P8?$ab;xv~S;52QS z4%s@f&K5_bXd=?AHvb-G$(yT{AZTykMHlkB*!l2$)-^-Z<47CnvMHX4EyKLq?I{2k z1M`CR5qVmD7rBQ+0SE7yZEace`$FQ{4|-?~UF__t<-at$tioa)#pZUD)?N%u__xaZcMaq|LV(j|YTA-JT&{Lq!$N}SvWVfF0TFbt(z6b`+_2H%DbcSR9V#Iz;5Qf4S~oY2(Tc>^PNpF<`5smuz5P!FprHHUcebYEm$cJI7x#L|CG3DYEgJ3(6#Lb6V!x}ez;*#ESAwdaJYWV;5ny zs!NzRGQH-Pxb=rRYfooI-=Xc^L7mm=A?C+lu1)KiE!qpp3-(?Q;J~Qapb`ToYB}bs 
zipXAoh4a>v&_6sg6Rzp3bMZcuQ~3J*^!+}EXmr+P%}Jg*W#12-Pir+T9M3{(O7#Wp zu!SLlLe}EJdlm7Qkhd2{O?LS-)m109A)M*e0kU=lh95baNG6k9H*z85E#eo=at3&* z<6T<|8%^)VPeHWiAGkKY}jCgzL z)30{XVL4N4w;pPjKl2~#Xm>QQ%}pg*f?bMUU83#OC6Y436pFu!65JoL1;B@i2E*8 zv&;!|Sa4Vs|0gpSy&*%p-0UE|9$l11By{Lc>5;tkV@5n{ZGmKi#-Lr8Nad^DHzd*G zW|;<3dA~9EEC+F-nUxi)tQt{JT+NsKq-R0jVn=5n5j$&*BZ0`A5*^_gva79Qy26w) z{dbr}^tJ0A_XZ9C{*_pv5@wwD-q%X8vi{ge7mZbgj~)zKr>eM{?kwlwmFW;>Jt-p335$gaIKsO9$BwNGu0 z`m1?fJq{7G5{6Dk9b#{Jvx9cX0ZhW}wzaIVt%#6KxMzcPP6!N`?KNIYU`cie1mB=l zwk2J~KJ_Hz#LwBUwmkJDH`*mT`lM>iowD2XA%;DR@&acMJm9h>JBj46XlY~MZwZ<6v~fV(?4rT=_nD_wE#)yC z9ih(HWi!r%>*fk&U#55u&v(Js$-%)W*B}kWJejUUQpzLS;>ZRIgEN zCQAYhIzx8PG=MjT>;a0aCBZ{FJ|Ud3d{w+E%b^F9cGV*euRSqr0(+=;nIc1LXDpIy zyCGX%0sG#?Fa~ZI2Hg}x11T?kw~wodW+V(IosiETrKzpnQyo&uRym4o4Un4*cQg!_ z1YLw26S@?a6o(xMxpE>MfOfnyzCyIKjhj#U|AisIxpSAT68$%Q0TzSWJ#X{=I6saO zH4ZF1ev>am_7AbMIrwFJr5<;ivNn9XA8MWu^|!aP< z!pzjl<7AVB#w?LpuD{SAK2j1vC6aE0MFAMEW$5(igv_whyN_S$^dsA!vB8dy;xt%8 zb)-E1_&7NCjk=R&$X_msm@tB&7(evEzd%Pv(O-yluXD<5{sry&|K9lj4C(*dheAR* z@)%4~kE&=IYX4Gh$T4YUWhe(|^3x_gw#p=iSLpu$+;|sZzKGO(svi3oumgyTv?-Y;pHBCP2_Q)>JL{bnUh*yS!sq$o*KBF3UOm6B zzcGff?i^A~6vp?I0)Oc~@N+yv#zlIIdE}=$V;#*Qtr{KRM08q>Er9zth`kg8;CbsS zG8;%)%QTp3ntI*-S{D;cTkLlySDspiG;S3YziC+2PHgM4~iAOujENrhhvno4P|fBl_GS@x;ir7spZ={$oAu*^|1($1ZL5u{Uc4 z*iz!R|TaYNkhZT**HJXolG&hcmH<)z<)Wa+(K)dtB~dR43F#9`(hDZ+%v(y-IqH9o!s4SkjZ=<9zb+$@ShJ`&_=s0C%}>gdc8mS*w0CNS z4jKTx1qk6@!rgwGz)SMvEg9~I$>#~Xt*ivzCXA~fnSM52^-kA>q5&f6%Ylme=WV@3 ziZoDHWgqK<)|*r^XX#P&nfHM-C?J@t`5EkT=aniDA9a9vqGj?06)y_1waj(#azVTJ zX?Hs73eO*oXt1vpu>dHr1Ba6t%c)dOSuN@ZcHbm&(GoZgDNUoyFf)Y=bbQI$^^XBK zXHorn{n_Mfe>@jnmBr7>odb=a5+N$cBKVW$72e8TdDirseuS0&Tir`;l4%_N=j##D z53D~Wg$1^%5$&&t`tq~tZ)oqe+}^MboP*nq432kqQ5&dmoML#8Fe0jfbX=I&#YQ<` zlH|)u@E{w~HDcR2(JC-aXX6Tgh91JHMR2y>0kAyM(jRL{8>^i>o*k^%2oaN=U!ERm z4(*r~_G`1gQLUz*fv*G;yp8?QQQPNqU&{wDiS&(ARj`9%b^X(mj3~$iJQPBg*!GDl z50}2SD;2&$!RS+yYNx^L%#zvQ5%Itz+93jaLLddz5rjzWK15oexSh7f9YB6X=Lcm8 
z7huNn?Z6ZNm0kpwmT0XD*P53*T;kpJjx8J7!#|s>Ykn9@tR)09_dLWCJ2N_x%k!!_ z0>UQ3aQce(Q{#5u*2^!1F~EZi zq)dH${TWVYFMB!Gv?t{eJ8M6aNzG||SCNu3UG{S%v*y~(m!>|qrucBEu~Y(lfAN^y zWhNXOqAvA0#Jc+Ei>u4mBEOL&-wt09@}H>HI#+Li+{i*CnwBJ0uyCYR)B^Z+@V&z@ z@!O3yM{{cq3HFTd!HK?}^9_mcy#?W!j)G~rv;g6S#F3!~QC~Do8gI_|6YD~xX`u|k zZLP%y$3-`qa@r)`cOQLW@GA$f10ylT0vn&jK*2=J3ajeA!XsqouQ8wj%c$3}wSJN$ z;f9+)%Q&t9mKI`znv{X!End;gnVBP*VaA5L61ci7yl@cau;fSD&Ed*K4e-mjCU`n4 zloMTO#p1%DY~u`Moe|#4_oaG7;Oq#N;)N5J!G>2RM{CCSrTBc;d_PsA38Gn(P~Y{# zR(O)Tg#iVUv(Q7TEH!?_cyHq}K!3>}wot8{6Rmx#nddn8wZF%-v}D}EC|o+!#yq^| zQE%(cJqjARNKZSD8Az9c~eLq(d!%kxbKq4qdPTyQ?M zvboyLWa(}IIDScfbK*9ouSCf(Y>whv!F)w^$VsSpV#M0}%rL0rNeBGH!9cKEsjCSA zPjS&BS*_$p1Qf&1vz0re{1V``^$BPzcI~g739KUTv_EO_M}gsS{Zb6B@DIJ#`SvsA zc~u=_f&Sd8A>>UI6G)7 z?|ND1PWDF?n=v>d>0DK~kCJYQ2Ob-t^}zTZ!M7+KpVFBNVdpflOGV@#*O`-y78jAq zvb|7r40E|>!%*LhNYBU@WUf}aURt$f{f%)anu@If5VP+J?#|y0?x_ZXbYFgpp~E{S zoGfUA770H1v+NCTn!*r%&w2#1-C|xNB|dwH?cXp_9}}uuLA~u~*1xaQp)@`E-Q-M0 z%6lSnAi2mOtHqTrb38Q*Hf~K?THnU<$W)CNc})Ci`nl%GTqooF4X0w_2w0<@f39U% zkzKQ{?AIFNf=wxqT^(0So^%lJgEbm)?AVDX2V;2zo$+xmdFPyj9du+zrJbjM3~*(&hy#hM6IV zHIP^R`jB0#^E3Bz6(TIwl6~#cx4KoM^mxZ|1t&q;&@9u1%^JBmXL8v%%pBJTH$M)- z2!y`LB=Cu_xUK(ehXrJB{+S7@wamHP<+^)8=-y(jcZgnr6M>m1r|^9oQo!7fk+c-u z!akVKwJiA68yyBwM4qkK+0pwF%3>8EsrivPzr&5OwV`x-*`wp=6QtNJj3_{ai7M7q zpGhMkCx|2YYFIE6jYC1~?aUOgCzd0lR?R1F1Tcr|_!i2as|x2IJ|Vn`s#$pjcV+Jh z$dlIT$y!kGQT<<88SNM1*y9pB;VCm#lBe7fHP!o3TnW)8erJQ0n|2mDbhS8AIdtj8w3&?YDg90m z#~OBTtu0k=%8Q_qy()P3L*z6KXx7#rlBD|kz-5|el_LFcg78b8J7=5sArwqN>OB^~xY}^>E4GhMur*oyg}6b1FtWf)Ll8g86}sC#|ZmXm|s? 
z6{MXgO-;BH@+N_U!d}QXD`uVZOBTvq(-B`xCbF*z=yQcqou#Hqw!+s+4`Zq7J|-Qg z^i$Z)nG@@N+GnkL(fO`UW?hv<)g+?5>bQ@+a_9N^%8mO%)o~B_MbN$PbfZ`Q$N?*+6mv49Bg#YgcT??# zxUDkt#r7xtH(p*!3JG_jM=gIJZo0dpmESj$v>25aLqu+n51EaLG$2(hXp`NKqmd@e zDJjzh*OgE|qJG|~;U5!(8*3i*1)>Ef4_+s+Na!h!WB;KPJHn$8@V(EJg<|K(2u}jD zhO+Qxmw;9ze{Hp4V*J)^57<-fO{%eH5ppm^krqx{E`OE~T|^X~cmhSS{Db&^Dq|tIGFDxBkiPht%`hu zN>F;j$0tN`&^oZ9?6UZd`|^-SmyY%;L(B{Qda=y63=7Y-E_{WVc6^v<1={swhaLDN z(nQ^zf!p;4U-}$B?2B(J7CX11L1dR0`uL~+){_HYr^uFSUwlIIB|pGz^uzu03|E{6 zH>JM{3QqBC($J|+eVl4vx;oAr8X23QRgWhR66B`%rDUnZ*s-nZbj~5dCx_m%SN>KI z4~!4;YcygIu^31QOW(DEXf?pw7+WP0#X3arG3~yTFV=aN1j;5SO*Nkj^V+{XIj@~# z5219Upy#fvVbdAg26lrCE>F4^Mk?8}m-8PP${ zOHD5ZlEa@ZfBQv=fS{`jCa|;DsJ4cDn8j$xPpy48BIf0ZdoB(dpN`krA8G#ct}tB;a#bRZ80&GeXBWQrr-6Et&}D>m z^w}Fk(8FZ9yl6DUg@>D?8RYB{tnIKrz**v(!mkg6C@dU1FIayQh((erLMD$8eJ*qh zHguY1tA5v+5Auu-jUP>x=xQ7)NhC_Hz`tMPqjX^O-+%RZqv`hMvLia}D906t31g^B zZ*!QQ;SYb^B9JVxb?>aB4OI}pb8>L%`hkwo@pl(F;adu6^ zS=@bEHpx|dEOPWk<6?J*A{@d5rhc*qo>R)D7rxCLI2>#YI|-C11G~2qHp}xr(rPT- zDu41KA0}~vdskqYO~kC6^tzE}En{FRaJd3RY~G)JsmS`v>i@^yTSis6_TR#Sh@^BW z(%mK9NOyNhOGzv`rKBW8y1S&MOS%MUK|;EuOVV@Qd%Vv(&WH1C$N6x+{l7W3%f0S( z-BLrdRD{;bqeYkR?Eh&m%Lj18n$P;z`unUQ4h&jc?AcBc9 zR~f7L{e!M2I1VPBFTZDV4Ej|{+B)2VMAJ{q+{**IOqEaYMTj?&b^AYzF=`<_3keWm zz48ghQ@~R)`aamKb_4RE4B3$szG7#DZ?}4$?!G6m|NZ@<-Si@&JU4{9I#D|3WtYJ4 z%+QG~^~Uta+hFEuZyBHq3W&7Q%KY45JTMcMen$N4xbR6HpQof2=abeS<#U#UsL7GI zy6ao5TpvB_ZFJ}H1C6kGL8wF?{ID?&HQrJ#lE0c5e2%V-v<<;m@98YtWnT~RWux5u zE|ewgrJ|`XxWd7n5>9Kg%yxxbwKZOM;n=~03aL8Z(%Ic!YlsUk7gg`y(AKBC`PqbK zCLi5JqD$giFZC1h(>bfJS?~y`eZXTIo5Q=&_W5&V1dX4Y!}X2?$`gIwkuxKyVYiQ` zD@yZ0<6TX&mplE(Zu@*Ln=gAGWyXVsTw{abH=x*aoaJve~j%^f)mq9{3%UU^PXEA$T&DTZ~q!8eO?Z=C|dU?_!im z5Go47I<1KR9Jjvn>be%oKl7}v968ge(c`rluBQnbHXF5I5+F?AU-x)h>qsiHu}CIjjTa$l$angg zX`)4ZVhRov#?6}k&Nv#u^m9=DH$vBHHrh{&0yK2eaO7-c_wU@jC~rw2LArpONvWJ6%*@YDgaI{ z&S3$bWa8zw)2LxVedG7-kzi3|6Vg&K;+xlr)Y;R>hz7pYmu%FPw_FUN#j>|m1*-m; z&vGFzOE?1Soevh8s!2AdUmAD9su!pe7?zrX5_M{^F%%~NT1nV69+Q%Uq1B5drzlIn 
zY5C`Np7B?AHvu`uI`k*iwj!LQE?x8{!z$rbNQzqRz=MgC z9nLJ#(t%e#VOV2lJCl1#CTJ1x!!yH{<-86Hje40JV=Y?mpuG{=E~J0s88ttb)Drk- zvmh;d^NcWc6)kWjG%N2$-&|PHTaMxy*(|-)6~yfHEm{b^_bwbb5ucx;SacXgSW`16 z!t#v}{ftmfnKz~H&}8vkOj|(PcH{E3rwkbG;d)+ar@5NpX zXml&0lNy_Ig%*IQzLtH^{TKlalyz?9QVfk>-X^L9ex0YN{N4X9@ZuXS=}iCOmH-M@ z0$T9hm*1VN*=d{4>+I1)k;2XPSiouHx#8}GvA~Lhs$Ux<-1{oM$n*0e30xlvVS6m5 zA3nAI80GVu^$n|A^ThjLikEcrL-{O~O#6G!1C2()^zAXc+-LLFDnd*=}>A8@{n8+9goaUWr_}w2`&wziy zbQg1h>BZ@Zt+6m1&aHEu^*!iLRS74NkQ1627Ch9+z7H48LQcMA&Cow@(rflfLns#- zMrD>!%^1=Kn`FWcdgb@}9s|UTCoa;Pn~Cr4dwlAqU%E0!=?lny8u6SNo~iLn?n3@L zVZ&j{w1uO_rW|RNQws_CJ&L3c)G=AZ!rrN_YFKu2#D6e;h+akY7 z^71yB&&6^92CzBAC@z@+8Qo{XtmN)g>m1D_GhYv=&K2S;4#xMjPJpjp1m?Vu@)fY) zoT2{tsajR-cgW)G*XQv3c28N#H;(mAY#o#1OARFx8{X0WLQGtP@WPhX-aHa6$wcNZ z1q)B1wHSxxGe{<8O}Sy~y}3Hs7!Qh)o3Qb4%+~8gZ3m?B1Dd&gp&G9e z=nZ|)|5k|P`muBVv+Y9^D#VOvgaam&7W2V)9Rk*c?wZ8}x#*d_TFf$=#zHN}(Mx4X zQwx3OvLPRCr({9T6MiGhd;Um#W|VSN*=E3V&Fb2YY<;~sxOvw4{+Uo&>*;6KtOvGL zrt9K=BJuoHyPiGU^Nq)l-Z;H+PmrZBi{=un8LK=DlFW{{dQ&+c^o4?l`1A1?K~Q)u z<~CvGoL7l(k3r>i^GY_K$7ge4JM!&q(#fhgEi+Oz;SAa|l@u6r60^Q#(UkARA3YCU z5&Y8O{OCf2)-AKV>b%Q@Q{)=9+RHnMIR7a7GQ{Lw>7+*GFq#-c>sC3UJR-1^7mLsw-Y+|J>q+9CO}VwPKj~9A zc~5-*hN%*YNZ836b?O`{PW1MVkyVr8Y-sTmkt)oVCB|G=CPH|)2KlA9x*io75Zk^2sB{WL%SK zA@8mZR&x7PTc=(hh+#bDJqS~H{-{bn*VMm^5fBZJ2_rzZ(9Yz53w-3%h9w*itn`}$ zR*RPh_1|nUzdYEpiV(c;BxmqtUUAa7gp=}&d(tZDL>a^4FXZbO{lJtPsX^>d)a;Mn z$7?Rib>~hKehVBo%VOhRq>0*DL9(5U{PQI_L~dH+By}!^8}=~)o{wC#D8DrE@E>JT z-198S8qXTpe{3;?Z_lz@Z(7@Oh$LWr6879pXJ2h(hgYc?)RB4G9t&^&w65ew$q^@W zcU;5NRf%y>7+DQ{j(Y>QUY?M=(G)yv-LuI;I*SMDfVTX_Hrp3!<{#rK>+=R`TDs>} zKtpkO1mi_Ybz73`E16E8R{at@$(cV7A3XT;=ORBI@jvlUzcP>)gw(G(F(RnU3H-%% zDQct=3dD=4n9Xg;<>R>JRpT`B6(i&0DL#Iy_(nl-AJ>!hPAySfvOxUq{xrj@!euL? 
z>9YMW!@cS7&|5g`GNTD--_ox#Q)&grTrxSw(kb#O7X1I*+0o!ad9Y`oi~pk44QyNt zV^CqK;Qgywy6I0atsptbV)PNJJk3X-V&e^jSgIWmnXcsdnH=Vm!+3o@~&8=#1EUH$ZVMsMT({^l^t=w^P_?fsSj)>d}V6FKNg?z#Jr z^tUK*R1?-b@IQ_;R#+#;D9ZBl;=lNAHY@>Azt@25TjaZ2x@_mk0GfE4<6Z(~5l}Pm z_m>IWt@;A4QW<(do^zLgy{!Yj=I+$TfUu&m2VCbyHHQE1?j7ARFRg397{qhWnSVI> zzM1BjDGZwJ-3`IXcbRaGKZ`xEh4{=)+z|eA z(7abs2WAo}b0PUu5+IYZv57sbqBH6t%Z?u@6C@JuPy)0LGmJ3_K58 ze2mXJWrq6>xV!8dNPC_vVnPkJLU6l0fKTq=~fj9MsrLqk?-;fzZQ%?e& z<1F9UF#)hR+c<<;8YlZ)DR2d#dkfSz;oUd5ro18i%S49vu=R8Tg|VD%^8$p5YR&^@ zZg_3Fp3_l#`MkotfS~o~BeRMk#e91zu3jRSxW|YToj^;S)X^GsQ^?as-?oZZNeCR4 z1ov43-OryNdPp~s{@MO;?0dWK=+Af~3Td48y)lm$M-|}hqr$>IVOY|n%*B-fm(dr% zPS0(lTLr@(L<`BscunOslN_u@vK~Z*Z~yQ;O-Oc;rm^_4de&MN}O{zhQy z`xOu$#2P`t(EXVgbw#kg5yQA!cFaKVK{_GeajG2kHSRgE7BrTLDmd-r^@gl8qo$EH zn>wpDGM0yZkrf`p?Gp7V57frm5=njNR0j@%&S*#AX*eJ%=$F?!)Ah1}3ZSky{J3V> zH-;5^Jdv3bP;Duy(g`M6T;-9flGLxB4J+QzDUyevj>NWuI4DKCm#` zD=D-*S+eU<#kCdFy`b@Y4*sS>`-tih5KNc|85Jk_x}hzxbqVYi=ka4mEDJ_`U;SBS zZ1k>^F zJ@%0gOF5Ni{;c8GxFCJ3+&u0{g0s@`SQu={iE?3ur<1T-TjLu&C>r!;DU;{{-KWwV z;~PmIro@aLb?Vts$7{n@fA--v7R`A_OGW8r`c*bj^NZW;K%yh|Joj(5Q9Le4FRys$ zYX)|z&cBR!&d()%04H7LnMPo2tF~xzP(bz5Q2V{oohxw7h#F+e925$jT7<)ph-xcc zvsztpom4DPrGWoZ81Tg$>0#QchYuUPhnK0PS7p^!rgx#l6Fzf=<=W_z1DNrctVs=W z@j$9wm-+`kR>>K*lW7`B{MynZvT@smw_ZoOG3}2fuAMP80<QtKY8GkOvVC?s+cy!Y{Z1|gv(q@o4DXRRVKGQ-LFh5SX(ahNdDI2;d>R57;q8<@x zNabtQT4d|^h>#*<9{jf9HP=EXdrfqOTJ#mlD~#Dtte8Odhef5B?%))s8%B0Vh6uS& z6p>up@BVS~4oT1y<4D9sYA;py2d8A~*M0qk1BbeCDeBPOmZca);XDR+=48~2o3Bq& zWaci0leVbYh!BdslTC-va$oloc0aO=gkq7R8#WE4Jzg5>pc{l?z5@51vdrCEOH0Bb zHp?2Vhxm4No-5e(UVb#Uh3|O*(S@_Gvg9^F!=aya!Ry|n^ObW*&;TiPIPX@_9(gFH z^hbu^`#ytbd$>o(h?}-nYT$?ne8XTK1yWuRKh1!;e(vf6-LC8H^DDzjm zfX*_(mq|K;R)vamT9_@G_4nI?FCBtR;?p#Dxri;|np{ zdvh8O%@tYi(Q+GH)6fefwk0cWEf`u99B7Y|LJLT!^5dfog;i;W@++_0PTRjOADLUE8ExU3|dWA2Z7?2_Z&AO*4w zh$01I`vH2Hm^WxDPv_-ilacGVVZI3!3=7@u(1+;aMHC=m}2Q zjy*vUKlrE*$sd((&j!{8Ze^rTKvfx+n*CqKpD2>^Pn zJ$D|lzs)u{WHd38Og>4%RWH_$|80{0=SgEb^TGUhL9e5z@wX{wq64N}DE0RFU#Hv$ 
zaZ$;ce3KWde|ywGQ?M|eCT7pd|7~F?pg@i7*kY3{zyUw!f{6{~65xH-pKU1};g0v71@`_kkSHF;kdz_shyA({oc+ zV82d16&QaYHaj@HcRgs@Dtt`p!}bZtS8V_!lA5MEGCg(=PFscyzw4b1@1tCTHtPz@ zm2P>HR)l*XV5Yq%IesS{b*Z){Z=#1BRTT5$6Vdr5c5G)IPE~vI1;ut6+;A~#t}f0B zs7ti3n1%!_72~eSunCd>=@7UE!4Gi|e&z}UuLV%T+dlA@m)N)i;DJ-~e*KS$czKo- zsIxF%+MEsAsEeAamRk_ZY*uRCnNxrnwx0O$U5_{(K~fAJ>gQNFEws0!N2 z^gfW4XR3ZV52P#*YLHaF;5>W=z@nTBOK1cpAxtl`+PckD5cI zkDt4$@PivBsqo6u0G7#-7a(^M;m^f5{!Apb*X6Y?0MYU!NGjGea+}e$Dge*pA4?TQ z8K^7qkBoJdwF^KwC%ETk#KR3pnASzYpad_1z>AF%JZ%GZD7QrkGK|#&fZXY};Z5Gc zV*o|TPc{G%dBv8^^)Y}+UNBd6!hbIE2Tt7ujFqh%h9CxF2b}w|0Orxt1D(s0>Y(zC zUjbbj`ra9j!)V`JZ6pqHhfDtg_o}fvyQ@dESWj*_uk7J!@2i;K5 z;$nQLAQ*86u#*a?%ylYy-us^?fwd#&Ua;Z^BeD-~9$r%z0}M^=j1kn~9R*OJojvIO zumPH9`X7^pKl)xMZml(*RFSdYx_xgSgWi)m48U(oVA5xfg45q1MyFw(VuY~`4CSS1mhF{(9F{; z7wo1%`G6pDk+1utP2L3kwElKf9plR+w8b|@pSKlJ4k`6fG7&?pTXn^L0zA)` z{@37-ObE5G0Z?0+uQ2~HfNk9Azn4V@WXCA+c7-&v4f%fi2jqjw{6({9zR>vBfSWkr ze{X^p^Z_WCYxg|v9|m3_)27Elch2>&Mbh{O@WIe*C}7K>FNx-!r&H~N@FDetiZ$>i zn^`XU>#(y)Y)kV>i9l29-2{-Gs0pRh`3Gaolbsy_K=*|%85E<>*V0J9gd=uqtY<^O#q%>rI=Ia0; zO6Qq}h;%TxCh9)TNTwQ@MVlB|lOg1h6wV@z8dA^oxD0PcATZehZ6+RD4FY+R53a^< z02ZoL=ni^x)$W6xhS|gngx!)Y;mf8CradSWz=Fu+v#btm zT63}iV4O~4PZ(G)jF|%g48xM)Iw8Bkm+%Sg>CQv8;kPMAjmYGknY!*lbR>N@`w2Wp z%TZ*aO^(ewg%(E1e7yqWxb;A`IAP)LcMq^6GjTFJam`j5JJ1|#Q&Ny92U4CSy69-4 z9&Rs_`kswnZ5$|51ac3BHYcA%L$F-U+50cvK{WzHcZ!Z)>U^ktt4$eC;YWI2X>~&# zCnup(Kk6zQExHVU+Z1KY&{HPp9(wc(^qJcw+)$X2ScZat>BCb9#3E;VudCCz)P;l< zgm7K9qPe!C?C&K`(#3&_h^#;xC~~)>Tu{!E)g<8uj(e*TH2*{Gwq4EV7&L_JpDkQ@ z)9`dlzHQgzqkQQmzDFUKu(e5z1+@e_e8>YorIne&Tbe5tRPvSdE@vBsS0Oj4d?BAz zX^q&+Y8XcWL^sc;-AMO~e=EidUX{QPSnwCe{2Y%XKLUwN1*i8}z_E(*gCi$H;0~2a z54yGvJN=#7lfWgUY2 zC#(J1bABDVGJZRN=p}u<0$T9}yyuCYM7M{3?j2`6{T0$h$-n=qD(cWgYHq=1oEc)e zF*n~H2`a|?FOH1$UA zy`GYpkA`t~;|$;}6|%YQCM-e#b;a!+XeHTx8iQwR?wWDgzm7v^>FYie=GJt5r`k`j zB+u`J@@PF-LDE+w!Z(f`gEI)DZv2uyK{KPqr~BE`xAVFLU(?|4d@-#(WY8!O7P}8# zph&z^N)z+7AT)$0ueE5Mv81D+Pk<4^=Il^E(bu5tRVhgtH*%!er~8l~M-c~hDXMOz z4G&FR#Uf0y}G}cz@x2Iwrnc 
zyWZ{0^@7Xz@ycyGy{o3etF$%oG~`@!|B9X9_EOXoPJJoOb`SUM6^#AePSL66)AyZM7XaBh z!v5^FRwK2S0ifK1F{0D&^7vKdsh&R2 zpsv`xJSqP(H;27#v@dC+2%{NsR-HlZ9df!Z1_LxaWcpLnnU1w(A>rt801f_sloTZ$mMQ zp#eTgJ2O6@|bmU$%rOKC?s$fX}|N67m>tY+YKME z?x!T(|2P(&ZmSsg(R~!N{3$rDcBnDYpGE&5TQ=7T#_2eMTORaTQq7NNocX@ZmD*FV z+S^q%=o}lUw^G}%_q4+7LBP5Cv{(8wf`ejy^cP)sKwT|lh))-y-^1bWDVP~Y#OYa- z&}wQZO?DU}U#Qd^=YFDs7m*0w>9RWjT=@X6U^oji&~KE^m^8N^_PkHU)#bwN)nhZ8 z@abhVQe|K`x!Jn;!coq>Ds1QZWOt<*b0!EWk|=o^csh1@EqqfnA(Yfwp($iCTj~~E zKeVsBOtYnPL3lRJLHqSp&XJy&j2|ligQgLdz@F;9%WYYX%Yx0{k%p_QDqU@_R!+d; z%fDmbe|K&e^J3Tcn{~q|xL{OlJzbY}sCygV!a{}_bMe-#lT8pVICHA|%_3;23{|d@ zF{M~mV&vhDn?VYW59ju#Ah&$*EzA)Gq6zqyIWIXC&;w0}5ev+wx)o7m`Y6Yj&Fn0a z`QbzMMwPB`_dY6Y_vI}Sqy7C8e3--g*iC_#bLKu-z8>HY7XA%jw!5{3dUYEi=43cK~dUH3N zzVBlqu*o1F@TsE(zWLLoaM>(oG09AOvd>kpalAwM=46mI>G08=1pMIxU;YI*?VABu z)5s5>rRwI`{t|}sy;bk>=kR=&;R#>qi_#}bI1hkmq)#E_M(lbow1hIYB}Jk{^5;#j zN_Rdk z((_f{og#T*x#bA<6lf4{d4{JwE`PWtI2MS<62zEUq+{3U@}$@F^9OZJBPZhn z19-NLj9h{WCN+4v2{w~A85o&vz#hr@PIf5$GIRM@m`%MGY(+EyEAWRLhtx2oY8|(f zf>suENySWUG@Mn#Q9O80;tQ9(IhAmq&uKGMk;ZQ-Y56EpQMB=N>NsWYif&bz_rjcv3qT;D>Es&!eSley22C5eveH7*0$HEa5xb#>54|kma*iu2 z#zQz5m{1A7Z;@-Os9JI9if$ZZu4~TX416NRV$+<>$(4ATuHv&nzJPpU$Fu0F?pL~C zb)jaUzyH1OxRinsfjAx1TOGnQU*E`fUm2(jpGv2?>g<+#xvcbAiQLRp)VWA%!JKR; zUajEl@aUeoQ}Yts4dCgv_S4Cw-Iva!28|57_Uf>q%~N>OprxsF!qP7w*Q)Y#L{f4s zZJSn(UF>iOXd!cOy;Oz zIlfiKPRpBU1nS{ES0A{P76I#k3dd&5kDo( zdu=>j(-+ESQ@CAtW)8X^SV`V*xEqSH6DD?OW;N#wuFvq)BPf-&p)K^vbQ&=F0mTdEE$Q`vW9|V6y7qOp73Ta8=&++K{NJ ze@$%3|J@~HeSRbywl7cj6gFX@U6%(&icDo}i$eECvFU3hlPIEP9Xs}SFJ;~_%3LoX zU91wHl$Sbffj3(i>fGW^RCtIZ^*F(*GWHq%%pkUx;T|8d0yhMJDtf%0u!<5j)F>Kw zO7*x-+CW(#G3?tHjsCCy$SCZINT?B!yN9G7H}!k0))fv6J+@VlEkor0v2sc@5CmUV zxaJ6eCXqmcE3RK;bjpk~g{7Nh4G4};bvX4=z6vE?qai}l7nr5FD((9021=<>M5CZl zhW>-X&3JBq7-zP}dqSN}6k-Gwr$tcgvu%MG$g;ag-sw3wCqZrV9P5ZR1vm#e&IHwU zMsXIlJ?eK+k8F0P*2#lAru&#^D|YDdpDW*j3QS&=x4Y4{2HJEkoiE=*JP zk--k-g@cqCzob{)s4JSp2wl+a@$7!uohQA+)g_eaX_(G0j@b&o=GQ)RaY(qyzKYVB 
zaVbZ=`n3wh8Zx3JdNq})5}NA1|IA)1V-|~QHvbl4h4;Mt6{19!Si14uonYjLq?(4V znEuI*MF)N)X-4ql%U!QOS9F9%-WjB5zWDWW1@GLzr}m5)Gh6L2q$()E+B;071xhKx660uvR zy);+Tl-V4Mxv8K#Ebadwn0bSp`~-fyzw&)#AtmdQ%PK;yBhNbuR0(M`CN{U1QVA`x zy~zw1(B^BBdX3&Id{P3yMaGU+TYsG^YwcIex7LbrAtBiCO+cjcnO)y?DUZDKRWAU= z6`lmPic4U6OSt+*ZGwtU{=>M!jm+ly`rKH=teLr}AS%8SBH1r9`{ZL1TUYbbnIgHbmQmDQ1 zoi;2307)vAhu{1wIQ!mNSv~7I+$Ezno_)BytNDf3p7DMvC?jYweYfYb>@L`dUQ#5e zU4MoT6N|!lTh8Id~V0z*p#5b-;Wj0mDG& zzM$*Hcra;u@qpS2?dR5epdnF21)Q5Heh@U?KX^8e9V#O|QNt)J@1S!d5qcOb>4_Fad@>i>ozm19GOv zSXnQ5=B@KIG^R?bs^mkaZ9SR%HzDwr`SX;TS;d;DDRLYDPuuDMrd-e^*N$Ta@@?ra zBgJeX;CR^gJ6^Oh?I@7!eVZF|m*G6P0-~B+bIp-DxsVAK2*E1RQKpGm`fRKdT-5ct zK~SqPkCEuq4AOKREdtC&jlH{kFZFjdkmMD1VUnZYw7>MKwa`bxn&96nR~J^wK~m@y zb^j@y$3%zI1;mdFO%==D@;y)}G#;nskZtM`p3+JCoAzy{d?*w->QDq(0WYwtksL3m zxaLWL11mzM&64NVezUKsYG%ZHSI?*mD3e?k67x-R3^S>dFk0>vzL+(rU0|JwmVHd( zVKRGu9CRJ-6(PzWk{0ajFZc%_XL?BXYAb#9J_}vgmF7BVb`wBGn1{k1(F7j3w1Ka*Jhi=JXOE{*cOK))EuFkcFnqSGbS`{Pf({>2C4d(y zYhj7H;ZoF*c8RldiC24POG@|W0Ch!rYq2!?*B33MI#q5@QlZ_z=8XoJFwDQ_dcaA~? 
zs&6q59B#!0-sAB&>|Z}B|Kgbs+sQ*1f%kmDhPoB@m}*A|$I4F86!XPtO&Ys=n*>Ql~>e%J5!MBF6xgo<*pnsVn>A z%13SD2Q_r_t3KTN9oOOY;C`Nj9lCZcuxwxWgj z3J|uz)$UX~gDAxcfpL~@uQf{tiIl(prs-#q2!YMFWo`u)fS}P_CaN}|NeGGo-A=|A zqOCjA<{veD7>bmFb9E4?t=i7V_P1h365babwnF7I=A`gMy=e z@a%rhQZv>`usV8J#iLW*oW}azHH{ZN*X3&|ahi%jM7?!EO;!)CDX~I7#z^to+v~e9 z6OyV{DrvXtP~6|3Lq#0cz$02F{AI>(?5Gxx@oa?YGK(otmsjuj-!%>I z-{FA+V;*=)5qAk8uTCw24Nmqf**{ka(>cDn3zA#S)rF!B$srglw3Jpw)O;ctJH>=G zRy=ZHSOgL!1Spda4ww$?hr6v_4D*SeXY0xSg-9KDAQVC+*w;23CHWvj9#>N$g37@ zpD}dl?C6@o#b4ahvmvt~N=>^QUX2{j4yPL*MLU)zsVXiV)F>H}fTa~L(yy>QHk`eq zA7N#^d-IK3+Ed-3yh#&%fFgJW(PX@p5UC$U;d{;iX@gcdo!)69X&`Ysspg5sZV)qn-my8eS$;t z22`f7cAoZ@t_)p!J-S$(m;CKtOHV9vBgzdhjT=dbT^uc~dxhpksU-1JqR(ZlbsPZl z&VJSJMbeA=Tz|Jiu*XGHw;fnE9SWX;bm{ZkcxO_?-`2E>Oe_>tN3Zf_!>EB8d38N< zW_6YugQ3Rx>DrI13LXpzn~>6y|oCr>tyi>I4s z*w==udsY6etg_u}Bxti{UJOrnfao|@`nk+<{!`~RMF*xAQ9gBEF{b_vg#)E}_!Nzz zzflOw2Tw!c%O;Fi2ts}7Dw6^b?j^`7e{UyKT8jk?T=xG=QR4ro<2pa5^#~aGBx{@V znFTw>Noj;O*@`h(AqeMxz1zb_joOoOk0oLZ>Pg#!ksj5l|ny@V-}j+ecgR$u0Tx5fMEnLRtvxkCWzw4cwrgr4tg8 z_|*K>Y=+=wl7{iFI!f%y6P&&uBQz7bk1LLRo5+6m49%9&WQnpouyeMNzRQ{|nQ}*B zV0-%t|8+%h$xMUb7w~F5FK(}r#iCckt*qcqfI(Wpo_?j?+D3&{2~P2|ZFjSyfNb0( z9d^-G!5|z70ff@GR`ait)eG%h9^wpT&%Ql@@jYGtQPMJMHYrV}y)fyXKbog3^(V#8 zXlbkB2H!>bkwHv6sTqU8rTXPYVw2ApOiFd~@~rk|)ILGcp-ya>I8Oo|Wi&ZMzG;TY zZp5LPb8Y8D_hZ7tHO=eA#@~IP)G7ccWDdh01LcvW6f#<7B=@y&-CzQzf8F>1sddM( zvsyX&B#gz98ZA@8OwQy4x#zpANK?;meJ|s=} zt_Y-j1}T}`N*ptOmrw3_L8D#~!MhQ_*Kk?w8PoBml?`E_2^d*u-Q@``Rq^nBIwMlnujT?iwDgJ~ZS zf*Wt&Umg2oJrY3WaofR)@p$RiXaJH9Czba26aVNq_2Lab$^{-dgST&*9?K*mVu*@t%h<&`iws-W_pMCXfdVvXSX&bA1z){HcJfWi~pcAr6 z(?B}(TCxG|_jErN`vo`I%~qoUmpsxv^7Ennxi2EUW1lJH`DVq2Ud!EpjdX=xlh=jh z<kMeI7$P;1D;4e-No|`e1sx!(_zg z{%VVMzP#%j1B-0P7juRLBcc9`0lc_lEsJiPk1=izOB0}V;bkKO76sU4O*t0Rk`Zk1=D(h{f2$Pna8Hk}7uCu*jeBw<_E_NJ1HSi}T=-zPy&lL#A5f(~6RQWHhlvUl2Y0LRv% zWS}my3+{)IXZ5&;`hFRptfogOK19*hvv7-;6SJUgp?928uS1_6<=@A($L(4+0>?yl zNXxwQk(xb$#}e?G+$q3z1QMawKDS+xrhGr?%KKUp0?#^TK}l5OV3loaoh8bC!~7gJ 
zrCM;*l*<20)lMNllVl7`y2_5Cqz7-6uODb`aQSlWN?eDUo@pjccA<(g2omML!q=L$ z+3k9TXhm;r3>tu%N3YCYBcbVYk0%uw6(jc16S6UP+@4IU-)D-$_c9Gkczk*@E3jDs z8Z1ubsa+S7`6yzbo1v)&bTf=d(Xn`D^r;e(!McJjl_c0f@ex~=3qfG)GTGjj#$L8m z1aXq^FXU^BujwrRBqBzhp}GVqn|t@_MVB1;B}Js@ZFsQrc!_{A2HV+WqW_2wqnBw? z!{bmcB&n!^i@k8Aje-;;zrXR5weqJp86WLs-mj||i15KXNXCx#XWwTYANK?HK(U)C zR++r}TJYG(i159**F8)Hz+O~Z>JvLI3jb z;Rt`olE4CH5NtTEEKxHzEI1>ZinHdZew+hm7Ew8y1gdQlo|X=I&Oc&fUEiL} z4y-=NQ8j$7Dc|%@3vI=W5(l063~VKW!L(#d;WC`AN^dw8Sy3RctH6zq#PZ?+e#6;$ zACC1+393pW)T^dUSA)hG)LCUAk)k$x`d-CYMO&pWG({qy=1XU2I}dQhqX0b(Q=LS_ z8|?#~7<@q#PbM#O4kdLdX>-Qv)K<_Di3-g=5ovT1TyFM5&i$a67;`GMGGNaunj9b=iiieH z#U9{>@bBYpG7q(qTu`t*{L2SL;bGv*r&W)W|MI&bp&}6Q?n{~={`&xd2irv|9w^OQ zW&hiJ3$y_ec-#5NPPBg&%0u5bg|$sr16htXpZ@(F#ft7BkN}IzOGR}_E#sF`|2|e= zZdj**w_hKgn)$au5M>0T_4)1i%fF7nJPnxk_PK?$e|xFlAjR<~6E3X3j#fRS?oTV; z`RlZ=1G;D}Mnc7xf14)R_Fz8E)4cZj>n!;Msryl+Z~*uA??3&2hVpN-?f+sZJEK(0 zg@AU`2HfEsZ{`$~LF*?wJj6>55s!g3ZeJI_W#7vJ0PJQ*T}D|6QbLfcZ)Hj;()#|{ zq_XnqsDPu_+^|t1zLCdDN%MLBVpz@9{A_z7FrohN4<>su#Hc*qwpkeL2%4x|!O>gz zVn&B@A6(>JCi9dBYoYcXWytx~3p7M&+@pJ|5C3nXchC}uM!A>04D`)^1kgFSQ(KC! 
z>#&mv15Mkd4|qvDdwZO>HKDoHhwZVQ6E*>xHB5rK3;z)@Lv7DHpx^x9`@1z>XV9}+ z0Q8wc%+)>EHj&Krg#8~UpTYESbv5oGJ>eu|wtfb*!W;mJ@MlDYTiLD!%H%p=*Qm&f zcgO;G31qn4x4{lcIrya@7(+-QSTwnacL;jMQvzNz>JsAKaLx##i~PQcfCdkxAPO<0~# zX5c@}g^(FsF0>P})&@E)Bs%^|(4guJS*-e$P%-)cu9W!cy?kTTydE#l-{HNVy*MT9 z@*gjNZ%aUk%C9=3XZy(+wCrChSNiVT3iQ)9n|UfG7{~#+w&yiuXsrSd-LRo6_3O0> z$NWW5boj*q`}VAlQfMI6Nh!A+>eI`IoaR5zzIgtUZ?J*D6a-~u#XY~m0C($fFOP3W zBUZwk}+`_~a zO#GHeFf387M+lOT!?-H|*$_}YL64+9=P;mxjN*goe)`@VD($`e)nGJP|C{e8_Uj zNvKRx&OSN%mf^rS@N^TkAD@Z}AYlUk$`~|Ug1xHdl=A-=06d$_mQcO}<>@eY-=H#C zGUT|=KV&}ox!`Q%aI#PMEjz-`pwLpRN8*RzZ@4o6g%B;9eGBDlWX=fH#js6Ac~KZUv>;~-l2$&lpZ*5$^^yaFYew3DRicNe zxcFGb^4)aw?|U!INN;qMEvR9~P+ZY?OFfvSS&&kG(#6NM2TM91TKf-Y2@H9T`oAG~ zU_c(;nf4q*yWp~3b72Hom5GnfvDT62|8kE}75oE;i99Z{o;=B5NPAr1WO6&&KzP)8 zJzY790ow;X)}uSXR$mS>7MMN$4RoFE35e;6zHCD~AS}VvzT?Hj8^*vAs!jzX+)Zgx zU)QbrF|V0g?7o@MajGLAy5InY=A^skBpX7APIPNw$P6`z*fVOnmz z{hCrffHM0QB9?}?Qqjc$YM3$WG@?{7pl6w4F#$aMg=v)h0>0pCu_7SoxpFmaf}&T9 zHu(7$2{v7(Pk;ZF>baky_Wha7}*BfK>RmK9;;%R$8XW-V09s zACpXFZ-CmdGF=f^a@+w+G4veFok>59%E}hwOw07Fk_e{j9wc7x5Kw%z1c!!8ilCfC=ZjIuSbd_Dui;t**id-+S+$+~Y;u5)vc=Tt{~em5O-Bu3Kog9Q zI9bGrBt_^dJDSyZ@6S_bTMN_CV@WmRK29b#-4Hsrtvx^7P7`A)Ns$yCK&PeI%h*lJ zY2pG*=@rYG06H4rL>Jybd0Q4-^9=I|p#7yJ9WGK#I#^v<2sVO-J%qo;V@ZNTNbrw| z$1jq3UNjg5DH0u4=d;4Z2erd4U&upkfaH6`9dHIz271ySz$+u`e-ZZ9QB{5I+CL)F z-JMc`q;yJ4cPO1AUD6<_!lqm4l#uR{+;o?8w}5oRJNNTD&-p#$ea|@G{}`}*T;w{;MO@z>;*|g|7kEJZkGGd2Vf~J`NdLxc3!6Xg6o-W`G zLtFjMzZyY<6!wL*HnHt0&L?N|9cWM28)`Sg!N6gv2eu`3IjL)s68s5hxmP>be^)PK z+FVx+H7=0q^VEQ*l|&AXg;)O%_D+{scQr`8aNEQW;0&xM9@G*_=GKduI;XDM_#9TU z6*x$s=Zfe&^KnSW(^3__ffg-E-}6_N5vd(Z<;~*A=@3Hpk`>~l$!Cgb@`^3=P*q)C zI>$C-RFjpH0rusueb+GcYhHO$(BKzoy-^leYFZxGXa2J{IZEFO!%7TCaJON$50xUD zmoVOTL!v`7uvXqz%=vH?5aO$|mWNkiL_ zSLWgzxZn|$viNP)&1n>n>!!6f*`Z>v95h)0NOZEB>B*d4kjO8SD^9`p+<|gP& zWc_U&ZZbOjq*#4MI4^QH%f0*@I*`x2_SqXpA0a=xB{`}g&@?}%>7&hiHCcf;4BEEQ zuw8mY1F8byLIlolE}KrUyBVZ}*TtqD{2Glmzou(@1hzTO)Usxf1d*q;#G^dSm8$hH 
zjw&zDvn%54TkRE(>?j|=zmpQ3t}`olzuoaun6TR7kSce7ioJ=qGH)9cLGm5$<<_q- z@1#xT2`PHAB#gD4BUwgUEWf+DtWnr2ai!cqM_l;A)q~xG2$nc<;+=I@xr&O2*C|%L5`G1##O5x9je0OtZJz6yZW(`Vf?V|!172oJW0J97Q)pcQ`g zy>bLYR^JE@7fQSgLcB6^Bu739y>l8Z%VXmA=7G%mhp{L>&HP)KkI_$1X>LBi>{O~( zL&*|v>)*(j^s#1q=$)4|rym@CrHJta8V;6qYrRKWO?1AKsU~hRTKr5{Q8NK#?{i;c zlUy{s7ry>cCJIo6qs24cdgx~Rhu;WpSG)-F(2 zv8%dI#H9z}LO&Tl2_dpa=!pHM%#&zvO&zD~k{kCCcS)gY=G~32o86oFa6I@V`2#AGLWC>e@-!rjFXT#|21`@3je9(<3p>f- z1-#l2`2l7?bt0aJ=`9FvCRK;t4R8N*Z19dY(&kF%Ak$!H=?ar!5406#)7}& z37*Vw!hC_u#v49=>QzCOlhKL^N#KxBlb8|@t`Ss?#>S4;DARdr^FBm8;ez(E*eGKE zs2$}GsCzBG50rc>M=dkF{{^_3ynVMwjYyw$C7KM3zn~1HGfD0kqzDWvVfc{aMxKM; z`g#swiR>oQlWgfii07?qK3iQ@Z^t|Idrk=S#1AOV!$n<@D|a>M{uUzkyTQD=u(+SH z#9N%TNSgHBv}JJ=8>(cmFPy3;ZKM6tS-$}uBc5V8f}#CK!GUNwWbo&r%TB@i4eBC9 zP7Bj`nHkVOKIm@Pxm$>eZegsg?Hth#T)0_^W`y+jemAQ)x-Y1qtu4Vly}VA=1{7pc zg1~D^3xT-PxJ2v7o+&0Xz?Ey?q7ll4-+1Jm=73uDR=k3)xEbATMZI?p^O;J8N{Sx4ro^U zywUnd+%);%yRl^1RQ(+yAcdG2+r`+I8_s%;?5+i5C6c>)^uHMJ^^y#|tP6!#^hHO9 z?zgsQrVet-`-z@fl~0UFP;Qd#V{Zk#=DaJ%?aZXsaoq?i2M$`aoR0RW#8jEpbU%9c z@?Sw{!x2TN73;yViV(`g)kH7ePHylTJ?%yq>r#Shb=S4eyu(L%N_U3Cx}UwKGW^;} z!Ga;2J*PL%!g=?IJE%89aPxFLH51Tt2W*KE-gy?pt>X~;aM;8T8_%;nTceC?ut3B| zb(cV%-?i;g{0)O-B-!6)j2Q1&Jbz7rEU8n*er~HRe8RCXw+o3N#<%7Ct9zdO`P0Cmv6@w{*T21z%lA6oA17?3! 
z>EoF#*u6!4G(hcSiB-|(iEG*L0D1<>y02w-3VOq8_NVn-hQ{HTON;G0D+P}-Ow2=L zV;v{eWb=qxcBH2U1b^kUtck~Ul;F*#X`^)2qtfUvdZhxJo)tto31}a^qlMJh zBS+}{CC^ZUM5Mr6uI9@Kyl-%^K0%IvjZ7fd=}}F;vG%(3fv+M3zvPw z7sjRUU{Gz)G^~$+Z|h7EIFl>_JraJrW8iofDDAgPzK#fKEPfLs-Rj7#P2|YaUV?Vm zUS)ibZIhfiC@DvRf%jDZ@XgcC7HssMh7YnLi26<{CDc1B<*n%9Ql{dn1leXolH|j2 zqNpWu8Lpb<&xRgt8Q;Zp8+b6Pf^#dH5^%Ve=jJHtVTwqLO0;AlD5^V>nfd$`O4&dh z_npyv7^`J^W_12z00D!1)Q7fMUVSKs5F7qZ5L2aGI3}s~Li`L2o~~5>>SkMX*=5JL zNr>v*segsJ5fTFYb{_xGJ_}e}~S$ zScu#VtX0LGF*vFiqi(C~zYUb;Iq-AiB~%%k7UCh5veag~Wv-Mk)=isL^9}o(7+sbn zMS&yFJ^Tyo8GaaQKjgE0ne8FZjKV(9Ie12Cn*bLUri+#&O1W8mvsawdPcZE+v(;_u z55LVcAROHDYz|JCxfzHyURce`lDB$K*R(sg+RGSqx!h(na_Gy?4E3V2bCgHw-McNE zq{LspZN9!-GXEmsbv?*FL;REOPsQ$Y?1NbD%KR#PhK~+6!Ne@M9V-Nv$76_Ubs`rP zQ7=cMG!Fq|%=QVYF~|VCmb65R2!#8lF9G)pcFfs7%JBDT^e2HPiGG zNHV#Y$`D$sM@^v^0ESF|0^iyF_0;eQ2;Ygu-okTy9oWW)#Wdf6fnSB@_>`DN$9#1f zqNh9c$utb{t12IMFB#X`kKVwJPVf_&_vSKE8sapuBcha2oPnb!AQ6qiY$ZTwh{*3YHe zB*=@(B2C#NcQ*b!ws5bio9A}T8dIwnxjmgg zB+_@?f>%0om({T$p$$@l+m{V^5sTxrp^kO*M!y3En=`{gR z^8mZM7~QCY3?bqt;!$oH)K%+`o>qL9FJs7OMk?XV%vT-Qv{=_%DiA(ZQR(;U<`s9S(1PH-w^6-3J2|GV zH845v^ff)(0i_%50hyWCG}oYHYv=|?*dw!sxk-w?!OXKw!u=tR2-4Tg)9Yg?IqfEh zI-&vsW_^nc+++ zIL2&qE{|9V1vh2!Q-!gT;>A0=X=9D8Xj+v@liHs0Xal*ox%v#L-l7%sq3G=_W;StQ z)z0(HI}sc8qVdD=2V7!dGakVYVAP!t2(b}3hkT54rqvnNUPRk+Lu)-Dfs-Q6ZQhG- z*B=>#Cf|1tj$kVX)22AYpA>2EZh4YQOKTfmcEX3CpF-;R#?_D_?AmIBk zm7j?J!g-2f$-QCkLyIXPvnMFHNUTURNi4NXGBYx1CraIGm8^9N41&}6f`utVDVWw% zB5dAA!D)Y~K5bAJR-p)&W~Hb|FY$JFk>*&&a8pDe(0grqR;RmExdqp6jV{L7SmaIR zF>bP4I99+Tq7og&QWVBIN7x?~-SoV4fZ6xH+!FKBxvP`-B?Lp&JTui0G$#k1W2@4b zmzAbX4Q%QywutZog3~o#Db5Mr@MXCKR#4DgBr?e1m1*+`ZloHaoZKe19A~5*Eqg6G zZN7qSJ@|7TlxYwZlBQ0+Y1U%1geD`=&3KJ|3+yjRBP#{A-z4mHd%x+g(quBX_Q1@> zzyT!vL0^*&FrK4!k4Ram2HM}XMh0o^!!*R4FrS8m{96-a(&Hk;948ED0q3m`vFG1- zunk%Y9eoM^_fs;`g4L=vHW6pA$tzdS#zWwLEth!I1U8V5S%)&^ zaR;SM^TI}Zwd53khU|}m>~}?3l=!FN?+$3qGA)Y1Xknv?rr4eoYardNep|QU8G=XX zvHAhu#%YPKWJ9+AQpfyv2Y1W5=J zPGN!BeUzLCR5hL~@T}Pu)DL-M%axd~Jzl)un>mRGQ&NtKjaPbR5-l2t5ttYM+OGIF 
zKib=*cLvSUmmZ(2qy;~Kr2mZQ$FCg9_R4fWq^!aboS7!OBymwnS_^2R$F}(HAo@@J zCAUG&9!N?;LfP0L-w0`zXXdZZaM+#r9hvZ5!VwJ?zcOf9(0OiOT1C&aT-Q_Rw5{GG z@%;IQ{%2QqjF0gZN!8Fxi#3;FVQ{)=X_%t;Ty|Z}oMtB14}F%QvV$Yg1ks&1DmDs? zF1C^9l%d(Zin>d=g3v8tnj{qyitYdKYZwL`%AxCt9ubJCgBLHMB#92jE<8-Xb^|INdq>=vZ36!# zn#j|N7J%psJi*-MB126oa;xI}q9WNM2bRCMc|D}|)ak#6B(vOa_~`Nyq;j^4hqXJ^${N}5V%-j-8mCnz!-14PKIB$ubo~UV>tM!<2 zh#T{2YP5YzYiyY7vk#-{S^dI+mGv}hPn6F6mrfc1iS?P64(+hw{MK~GvGUX=r+eO4 z2RDS54S|Gy)t`Ovf?xbUp8t!6lsEvBIM92>p zhu9p~!03y-;k*V-eXTqV}rm{{OKedW)nZxGHEuD=5S6-vRGIJR6h{XPZiVJzazBzil0{h@V2{JALIgm}{A0zJ!0`gP-Au?&kXL z^Lca0oVl_b4t4@(uw*E$9pB+!3zLc|#vZaDy>lsi6GK3OH7?P1WGtye+ka101k&1Iu*Cf_8{a~sig zXFXz3Hdm_CZ}@(Dns11nMIHE6P=O1=f?Og=y#IdnrhJ&f#$c)7Mz?j#JZ+rkq_ZhV zn%wSf+UTpbitbOWmhc<BN}}JUr0hK5|P*hYr%o2Ga+I>hUKdE zvMgdTaxIFF7@1wQnl<*sWJd4N{=*>6r!hQV7KyYDE@#0%LIk_@X~Y43VI``n#}t*d z)*8r3RxS93XKK2+X-@En!o{VH9~z!Nb$e2ET!@9t)NY{D{MEIoC&nV=+=D&jSwAZz z3c01(;H=+(_<{+`CVx^TCV}OL*EzFUsJYfE`0^_Kx^!PcG>65=teCB|=Q(gYFii!` z@}j;v?_Y+%*}|B(b5-jFJgTR82+cBj!#r4jy$~%qhL+GFt%K{bk+e6!8w2>bN3r>~GR= z?AQGe*0xat-Y=Lo4?Y}X;G;!d^g)Tny&R+R3*yMUx{mlG*i-z3C=-kjs0;f#rpl*A3`v@s zSKhoF5BqMx2hT`WU{)Qy1^gVaoGCIXiW^vAo#V zMHHzW-1Tl<0-h0iT$x+2-%w1Qe|8FoaMDpC5P~Ctel4s=^ty?ueX7V+LosRXhZfWG zkxO+V4#bXbtEq1H!#DZXEvDmSmZ|DVR8em-m0*gLTmfI}bMOG=F9EOBJlWKRix^`D zd37w#4L%zEk;G$uC^y8qV_wSsdCGw5Z6DvMD`TaYMOX>YRQ}6R+=6n}^j@Fn}s@<8xFj%U|w)va!g1K;UOmS zJz+3|vTHdBF)$5ZrdofCK@SDdk;;=$p2~0fXwFLYtos(iZKmVM*WmO|9zD%}0$42H zpPd?*^ofymay;!ajrDhV262JH^nx&VFajmk+}+K5a>13 z@NxG>f{;(BY=1&#OUdIKxXVDMUX<&=?ae2T5#Pm^mfORq9+aLq5L_iP2IKM>J^i`c z*tibUjtwrDd6Km$k_9U(nf!JyVm62ZX3|8EZ97H%p=1XA1zz7%PyZW9svFgw`6tlk z!G5R+_C66WbH!;Z)xKg^BT)=eU7Ren{*=>aLvDwY3QH{<0*TF{@#iCMtxKf)4N#$k z9w)JcQe87%->H}EKF%{uO7~PZ51eTJ0JGgGu^2e_2`}-NY5TEDwtNL8PI+e0J!lQ7 z(PT~famUw*ES0tnDIwA3SRYhomnQT@S%AZhw(CMKte3X07lqKWjOPGe-OxK>oggxV zD609OQ#?x0t+E5uq8^`M8I^grtQWgoAxB1jc%YB(A20FyFWw{Yd@rwJ81Of}G6D)3 zH~9MgER9G4f+xit`EK4P?Qh+vM?-1R!k<|RZLR_@JZ6?f$Mu<7+I=XQ!yn`-M2gg4 
z+G1toLE92w*!+C8={v^~iJpGk%#a(mWf^b)9A6}K0^{JJljcDMlwQTju!HGA#A%L_ z3T?YdHN@ap1QR6qdo{EM=WL~!Ku6=1H$Bm9;=F`ncDSmFp|s(GNMaLejM>#4f>e1J zrK+lM24GpTpuFY_3uD!%>+UE;4&2k%--7Sh6V$%Cm63bB;w%1M6~hN>GNC=j8+!Ta z9l`?J8R{w`@=Wa5u?&-%mD(~(S|NW@lf3yBbt%a=2~m$MjMt+#xr`##imvFx!0E2e zSsR%#MQL;c*L1L(@&|o%*~HhZC-NPQBq5$gj6}g>tvO%*5IDXhUPe6NLu4jKqCw^x z^nr)?rr-$o+Onow{&xScg_klbFgRZ*D6QQZin}aWWpR3roDva4gbaVqS?YIEBjS=Y zhCEcf%z1E$2gtekNy=S&JlEDc#>t2n7c3UM zG!v3bXO@6IY9J}(`l0Om4HP#GB?w2)dqYm5o9@UoV`@MmAS1p8L?N0$d%iI_VA!_< zF6!PfT!HFyv*s;l$&+hvFwH=NXQ}?=GDYqSr(c=PHZ$p@fK)yBMepu?Z?>am91?2e zSiNGn*UV3wRZTItG;)qGB48(Ya_|kKq3GG8CNenFz(ew{xTz&W5chOMn5veaie`+o z*8+UTVTs5Q3HvV6Vasas@U&{94zLwUGz8d)my!01DsIckU7J$@g z0)VCONJBYJLoQR=yrH(O`IkQ-uOuHg)2eC-^n|&p^k61I!xK6QTWNUR!{(rMD3)Oo zx6FRghquglECc#098dCWCh3?k)UPufGhH z@Yfr*=e@{Uwk_$sHvgsJL{(>o0?Zejq}i@yK7GkfIl2dD(sm6|ZJ$*c&xSF2-lL^V zbq`YLN2nWda3c|QAt3ziRY%v5T>*zUEEo~lKXWO%IjkahrkM=XCS)ZYiENx>xzRCp z_oaReSLYoC3PYTz)&)zkx>4kb;te<{A+Hu5K8u5jwj!D_xSbVV4=N3VyO=ZcjPOxA z^GTe2>du-_eBxjVqk#w3H8n`d)zUioW_r^DyULXc{DdOVOTQu#SExGzD}_~Jw(M(x z;24B+B4|S|IURaYFRH@}{hs=0Q6x8}11betspXoz^RKgd#GAOpm(fIts#z4BAuPU& z(KXW~Sn))de;wor^fnkB6u+H(ZVN3uZwOJ>(2oM!ZjeGig?`}$FBqX2zbZ(*hMCjb zF;4#4N<&;vYCC>3pOz&B&@s5Z=A*Q=U*CF7Evu-H# zK7~HW*%P=RvKT}~WYdOQy>9E|?t&D0wE8IcMmdYd%U=uN^XpUf7v1T%Qk8sLz}j*; zy{~6rb#95gu zT$6ajQZdqxpZ!EoJMur3C!8u(jUvSuan1X7V({5EhJeP29jtLecBAdEpb!&s83E(R z*Z0c2%x;p2yE5fSHDf}?p82L2u78{pxS`F>`0TSAdgsv^rDow_(-(!+C10oBa{(a`Q4{s-uK;jNyUq1gDh@t$iZ{u0+h#zCnsy zqn%+~oi`J|o|%(yK<(FySdu3uYQyJD*cA(%YnXM3p%*AVy9!3~XHQIdJr)9Z6ED8k z`oLJ(&ZiEiBwk9sSpj(5Yw@_o3~=|V9a0OsD&W{54_oGsWdl6z(XtLzP;D63kcfg5 zIcq`+=Yj%CH^gfpS?ks{_jLwlP|yyURHJ=CBwWAwGzK74!keHm8=w5h*6DA&m3-f< zRWQ(UmHCkOJI$Fv_^g+`4OA8yL^hVv_ZA+WQQchEV~$wm_XylT>UWGlYSWtP0U_To z%AniLfYd^UZHe(jhxZ!q0{{R5aNy(FLzChMM8am!(GoA zTrw=WNzR2Td~1A`j3k`RaR~jwsDhWmj404c@!c}Z86z&Ph;6(9%QrKcW$Th`3#qUxfw^r-mgv> zHrL1E52xc+&H1BR&y|y+EHi)9W80PVu6w;$U|KA>$jTyu6<-HIe{;%h5c}KcFnbXy$Xc``1qJ 
zsm88*);B^BUu=);UM@LfV)#~#&e87+HrDe_NyC@Q{-hS?9<83RmfF4o-}~wWU6JC2 zue`E|oC20zT1=w8YgzD$l_9=1dzj5C7< z%7!NvQe+QZjG|)?+nI0?K<2L~_+*ZDk7+v85@XX)hI_paJ(Q<6C7S&ukj*|dlAnjx z^ty2p>jE_|Vr#8Vm!Rhxo`+oK$|Am#y8qVH!=iA=sPw}%iR9A$G;;`9f=nw`hoBk) z?bZhP*Z}S->dr;$IH8=r;ey@&$;&)omll*ijlJGR|E3A@L*Nc?9gn*Mb60PW=y5ex z3g}k)@pXP6u%oC+)o6S9H4r-@iVlHX?PuRf!O$H6Vg*O7uvrW@P z9Tr&OAEJ7?<-R*QQUxFFDX=xoM~M%45HY> zh>@y)|N4IrB!i}~zN6FX&HMl1hyQV%oN+wN7JhS_`{~U0HGh+NG-A(f8H=e%*79 zesDJt&buk@|8!wE40Y-b+0Xe0E+#;Y0Yn>2Gl~89#}5ZL*%x6eMI(6jQdczQA0B&j z+Yi{|+@pX0@Q;rZ78#I8SQ1754^K%3hQg__zdrq^Z~;qZ*yF4=_GSI2r?dxqoL(s3 z{Qv1f1=!=vd$w=(kB<`uZ%rzU-u+J#(SO`vfSx>fCs<$VQYrjT6U%=L7lpy`z`l416u#n1qQGsRA>9nDIf?+({j0g? zHGU{L3w$;NtcOztV6?g%@B--1U9_=RyaKvP_q8K{{&$H-`TyT}2&6mU54wUlsa${{ zv5QPxtq~(-YiJV2n;nu+kfN zI&tIp8-Kb_6RgA12vBayl<@MaAGpa02(~O%78N(D6H>;%kNPg$zo8D#%Nw&X!fKH` zqq{YtU9byQ0M_`hT%&|>02c7$Cr~|J_3Q!>iUL@o%t2Jb&ro@re1nBX+VK9zf5n9T zS7buQzy0_dG(~NJSN-nnh~EXoMBI=aEahBfp#5am2<~g3*$((;Sp&0wmS<3f^@eqZzbeUl(nR$Vb%d>!6t98jn0LcCi*PE zvi$7oS3|Au&u3JCyvD8&{pUJ%SXU6HAQOep-*z{({7qtc{Z@kx3AC5-Y=8%^UI|1~ zRxq4v-UY1EU8Tu95QL9^WlaCgw+r)(+y$``EbCdLfKpjb{TD>pj2SWj1R|=?Spb2I zgCyWN?uP~E*e7EZJ(TZWcp-Ui!T1b{%|j4$kXqWfS9DTYH7rjzcEI=&v$0PJz#xfYkyFhLdvC*?AyQ9zFC)7jb?*j`3<@#$-3(%M| zz`~g+-T^R4mc9<`(*+G&hIp)NrnFgZl~p|ZHToYe0Rcnbs|C3w6T9e(-US)x5(qHpeCbd+8OH?L(X-((klJ?zNL;7R(OV zf^$F=Wzzj&II7Bun*=-Dwujs}NtG6)wYeVTdw?ml0QV2eIu1mjS&Mal}8#e&# zoc$cc7S6^t0Cpx&YLu+%uoBMq44Z8p1bXIyUKfMhBdn)Y;3dkddld7wf_GQrIe(1H zlA^`!!%je@mrU_`1cN06U0N<1-GcRu;e;VXYV7^AIEnh3m-hXkI z+c!dMvwo6){B6<=z@y*M3uKkb!pB|xy>l1(Rq3T@WQeL|9^EXca|C_kbf2;_coBm< zlzA6jC)T1RF;SpHnxu%oA$raopB})znjI#k)Q=4~;Aa8q_KyjiDQQ*HR(hc(L91my zyOiZ&=Lg~7Wv}~isPsbBdqTEAh4Gb|HcnqU?7wg1j@_F9@;Q7y__ShWa4HZW>Wgzl@V91Fs4qyy&?c} z_j_@qa*^X|MKBH08zzi@!)#!{p3S|!lOm8$$Z*lnaw=My66NKW?e=v5R+NnQE)NL% zP(hmYzTjb3Vb<4hUgyxFSZ4=PNDUq%M^NkiunJz{k z7k=4I9`W1qNQ?s(h1s7EB-Of49OlmjmJymIQRKZ5htKfh#V~1u<;97~(!sPW-ORNE z<3V)oA{_6)hlK|<57z89Ta1?|`D@y({XIeNA$7HD^Oqiv`LRA2B|M2k8;m9lgKDfF 
z)lHj0tH|o}`uhUppH0Jp`}n|7cqC9q!0w@gnjK-Ca7cU}#PrHB0HTmuBwJKe8l#pPdKe8NLmN-6nPr`0kjjZDV2`7y-B zveoq`H_QqJT27~uk2^V#q3)p^Zv)PE8&MWleOzuH&uMll1gH;J%OwhWI8dMO|+-jJyyz zEB0P{P?R^*s!4me0r59M2dEi&RUP6Pe!zATgY=pbt4ID4a@HnJE_Q>?#Pp&E)-BzQQEHh)^&nslUyj3 zi0KHXcSg%2QY^Tw7JjvP2_Q*BCL?D)j!#FXz1jK#vT1D%=qRdFdXd&%Z3;)fpW(~o z)s50h8%)&vNrzP{jk&YFd7lmI?fbD~M409RoCgqReA1o!nbpGDGp7H>BvxLSZXe6H zb5)N?kZ`xUh>&kCC2Kk>#c#&NI&Bek!X3!&_SLnySaOxngs;O`b)C&&clK5h*3MK` z^ofV7)0f>xb`T!lMZSBS1GIBN!o8=M)Ya#}1gb37dd&=a--Jfy@I`!eeo`jxl6T~i zV}$P9XOcJNBHk*Y)jPR0$@>;CP5AxJOBqehJZe+W4Sc>yf*XCPI&4ixD$3`p?~!3s zkV~ucW3=yKjqvTj3&Pwjw1Eix(OtIEyYal*n62~vR7X^H$E<^NZ7}5y3Hs>MpR7)= zzh%UsPr_KQp5zE`@~%I+yUJmAe=jfXsiYmN(kWyAWNC~uYuHG^avc3=&9JWCIAJ0V zf&`WEym~0>{0#5C`H^31*>u9e4_^W^?8!8?-eoArRFXF0US3v;z@zQQVd{w#MlTu2Cwk$hV#iSgZJ$hylx z6Q*R=#g81m_HpTC1+uQHsdeF_NY3Pf&aUlo@rLMnOfDs-%X@(FFKksw2Q7>$kXDb3!QvQN+i;Fe(jUr+R2ybw3TFYYGlriI-$({c-UyjgEsNdDiaPm9mh1dcZHZO3Nf zkFZev8(wo9Bv*vp=3L#0O2C=XcSWY@ND^M!USbOwb5fT3HhgSG&(O%rqLd#;*!jH6 zUKq!~`%#Xh^2?5~ThbeSWnv?BnF^YLC_C$?E+?QM;3ucLq;EKz6BU35^0)I(qHXcd zmgF+H&E4LJH3(%qvbenrSG#?&)iAAnDnfte#=8{0%~^OhH%;|KY*^eV9xt0)SHM*9 zcrAj(%{Nd?X1R=U^xTvD+*AFr_#2eM zMPmdrFEjapjV;kEXFXE(-)blA!<-UtPV;Mv2I3m*cx~5}%_j5@IwI#zcy=SJ1Z`q^uibKsV8*SWUHR}nD?*V9-Dgxni+_yF+sWhw9ImMR^gEOGI`@^5RW-}7Y-CGVr5LCu&)oZ z7eCuO35DJPgMLdUk#{M6=SP+H%dU=h!0&Vy zLr?qY-tRH(ej>Ki_O2+pzNO~Y)~ofiFQv&V`R{|rb~o)OB3aKVuw5+MI%isMxvGy& z-y65-K~Ih%;vh-7$8WCY8e2W!G>5RXKg_y0`?7uJ_Q=PKA?zU|+n%jBeczicQhryk z_r)qM{Nu_zxsftYZ_*~)p0==4&PcF-fAkb0?rM9h1~o!Z<`Q)F-U69oV|#;WIY!r9ry($gdrDMW(DV4ZZhiT@)cslK6m0ieF+@_yM27#hQ0&ps zk=>&(*_ib>>|=k@SqUjDy&m!6I>2-OE|y9|*vW|Ln<}EIlu7hj+0eu99kZie+GTPp zThI+)pM8n$#QS5B>wRZ`V%i^c`ZHwH!yk@Iu?Lbch8vcEs=?~}=bN)jsCztXFxB&oT%G!I z_Vg7jZ-@MyTATHIEG+!I2LmGkNy4lYb+-(=_^7m`lVh&oyAxYIzXs7GdGVBx9+6P1 zIk*9$j13FjW4mzwB@RdV6NE-mo4b@b;z}l+IkpbnF~4Ig6WjZ$E_um)7Gm+={gRPU z`z0ft{J-2Sf=-5KL^A4XM=s03CdsbSXjd^Kcw5+)8qY*qW7ehrzB^0TcV0nooI6y! 
zApmEV+j`%8Wmf4_=qS3}`!fzEzuz;a+j=nwoRior!}3|D-!r-ci^Y}SWT7p5)tgR~ z7>aM9*X_vEs1!q|{5R0`KvtrAackr=_#Mih8K;6N z{B!Bz?6mx&$N8@02PjR#-Cn`u2O=Y)b?-F;{YeZDGvlw_Mo9@D?7B(i(kIlq`@>bV z4WMf+xjr64YKKF7MZ`YNaY7n9oBd+L#oHw8s}$~jde`q+Q7ATAT>BX1)2S5MXM_0L&7zL`(A`K-35 z_g2iW!tSE4-tp~CxgS%+BdgMXH5rbRvPwDp;0>T;>mw1R9IQ|F+0~DFNu^=)Ace_J&-TvT);~7ycSlQK5$-1+;7RkMnriQ4?p*Y< zAR-w^WOsiGy&no=99aHA`DZD*2Bb&QS4~b&(24SId+uULx)n1ul2$g;yuf58-hO#L z-$M+YSje^WI|yili-~LG@Ts8L2W^9&{Q}x^#&;Z@aPrn&l=W9PhA0OZF``k^$bUJ! zi>nNthR7&$%<1_;B3v5GY@0C3Jul{vFYRVJjJb+_H!wBD8~?6(HCe_M+r(G41I12` zV+&=SVLI>I+Hkm?Mhj&FV1hzRwnnvcGuMQ)4v@si0 zgw>j_c)J`f6)H6-({j%bJcg|UTBlB8)njjLBG;R&CZ?k;4tTCFBFXJCj`Be<3L7xQ z7!llf%c;LVP{)%S4n`AP;w3`sMvgy?jSub>7)4k@9Pf<(qVkcG_{?@t8z$f`ziY-# zI8`rYigd|b zjLE{PD~{jkVk2*a)iykAxaHn4E>#2^mz&q3faO!Ef9^`TDL*P|=14f8)VAqD%W}cq zx%xDcbMVRYIQmcPp9HLLKQ6R#UqbtM=>d|zBTfYg!ZM?Dr5$WnEbMI{o`I0k&?kJ( ztNgk9|J7qnnjiX4TBF$3BQ}*j)E+HuIWS9Ve^1>$qnh6C@;Y>HUZ((~=GQQjyGUx$ zPbvt7%mHy3H_6nJ>Gq)Sg=w!TPuL=UMj-(yap~Xu8rLq=^lFdwi63~c{Yxjq$PO{e ze4C6~3lP?k`cazWj~Qgz`(isLot8K>w`&dvGB>J}@0wiZO4aG4$mBI%h!iLO6|cFw zA&{~=**6l84CH1sd_qO$tH>!Y8F_`o#AfAB`HtBN_O z>^|&N=P-%0~$K&|7(1bF$z0BW2)HefIF0 ztZu>TPPm%Wyqm>1;=7SRze~mkf1mm9rg-C&jm(d~rC@i*TI&9sM zp|Sj^&~{bBY0k-(f2iI=qA~B$)lYw++=VmTUq9SFQZ}dn)O`;2=XQk1|23HW8vk=J zZPm}7c?H1|c|H(*+d95(8cR1p=73i5rl|Cz7D&2cRJ5}29M)E__T;CsrBKZajmk}E z2k;NJ4Bm47dkXR_qv;nzXoT=d*?t83BJ-Llf5*D~+wk*WONQb`!6sBfQXeAiCYu*O z=_Ka5YS^y(+-04T6r7p%o@D*qNp+F9lWMA9kd`?l&#}E^AIw*k#c}A(rp@j8+!cb#64C6zW^G8w<4y9NGGSS)1xVn@5|UFqq9s69N1THZ9BDZSlTm2DnB1xFw?1FSc> zIF+Z}m%%a5j;yOn-!)!Uff7+raClWG>Se!fI7VWWU|we4v0I=9D3^EzG8Ol`@MV=o z>UDgkTg-j+uk#{Fau_yiME(zZZyi-twDt>&pwb;%LPBW-Hb^%}ODiE=0@4lA-Jx`Y zfQTS1<)&fNA>Aq6-EikV?>+b4@s9J|bH4xY7<(}IW3yOmt~sAMpXd37Yc8Y$hBR8z zTQD}m|E&)lrYv`ekyJ8@Dxw_Z8Y2k%7 zuuWBW&x<;P#*_*4ddOvd(KMLlaTs6uSzY1FE((#KyWWyKrtj&w=R+k9$0^!`x598V z$jAs82s~Jq-+gwmnbDsx)0)=0JehB8mmduCQF$Ctlf^`6P79b8dII|K}8n19L_2A0JU>vo52oiJ9J}KR=Cw zRYX9>=@=!5qkf+4FnUJ&D`L(-1e)vV*UGsZJ+0~dM|C((+?&vQD-q_j9oqV; 
zmP)2Op;YIha^>{*GXszcMNL0Ta?~Pb3+ZeeTEDED4EME^u^_KBKN4fb8}umdqE5&4 zKWE*zh$C0+O|Zf%&%TJNirJd;1v}WKbsQ# ztOTb*D2+HnB3P_MHWf&Se7<}DAbP9_mmdM02l*rt#6HMQR{Md(_;W_w zlPu$uzQ64r4=+8AGHU`?-Y_bLbf@9G8(*7~5iAYt&)(}?6935pjy+O!HQeDgnj^_y zL`e7I4Th=){9Vsgd>lFUZc^>b6_GZh9<>#BeI@tOoZuYbeLh>FG0+=9uYpYLe-0Zv zuP%z-u>x_zTzxODzDmSv&Z-R_BL0_lQ}2HrSOdkYD-M`{0_piZ{h_@KiNp%( z2%;ePx4n$m1L(fi@W^@EVoM~wQ=Hfw8M0?6H?wxH^vocCj79HEm9SS{Kz;cS9yczb z0Po^VvKqy0Z?nsK?mGF?xZ>i1y=AL+^z(J$LjiQcs;~FF!t#FEvpWB_=`&g%TX#Ym z9ZTUg1LqEv{|W-4YFd%g8kb$Dg=LP3oF3=u6JxzDFyWBri`Tj$QnDHSaEfw=*m6oQ zZZC<9$y#>d#=Yd#>e7E6HfBGL4#O6}8YU9ALldKnf2}zd+-$zb>Gjck|6>=ceg!Kcd#Y@gNBX%^(|p zI`z8@f)#d?41kS)_pGd#I}Gdl)2a?O<#kQU>cS#WN8U9o%5 zLp{AUzeg=N&nlXt>^T#f@*95^mBWuA;MD!wa23uJrxnj?<=JPl$J|%|7C*z_3IoWA zWrK~S6`@I$PE8=u}~0y{2Q7)E=YJ#&CA0@yF}amLh90>qVF8mP(W; zuu6rw9u4wOKgo?H;)SlArDUd^1Tqu0{(Ce)vUF4(P(OC2?-i)w9#DE(A@#=x91XP4 z*h$X$#XV{&YK+Mp`=E)QAG5&w?rM07ypilBL-Cu_= znV|C1@z~WTBE?@qgGh>}92}1(YQWP5RZ|G2P)9uJD>b3nXJ~79GQu9rqVK9ni{E>> zo~4g%@$G;%%Xm*rFF>CzPgct>(*G9 z^+oKVZX{QLG;82`+r_mPEUB-$@+&Q^TZblT=2-3SDunP^mC(%}eTDba6^A)2bp;5= z3KjL4f2Ol0#BlXz<#^Gz#ZFbXhbFDjy5Yerdvi6$nAK{_87g+TN6Q-MS_}qFS^I)G zjW16Ik#C9MS@j~!Wb@RfoAvzP+ucbgY;B#a-*2u^Xq&e=-jZ53M1>e2Dai^vbe4O# z5OQQQUW>Di9x3TnYc5h^CbF9|Hu&}}#{R*whNFG43HLw3MCX4G6Mxn<=N3~6*pB>- z=9)ilp;M zVe{y7Pi~iZ^LZo^eV#nZ;G{N4_RVE58(xdjv{o07I{wz+7Y;KTGA#2`jUM7=SvZCh z4Y$MV3_*!6iE)~bjagT@wRzt$>w32NLDH>v>kQpn%JuX8#UnYaNb~W@oT28pB-61% z5q-mkhv*UylZLaqnU+-rir ztXP#?{kafGUSk2FJ=~bjyP9h}Z!!{y6dqrq zK{KJn-}3cM7w!IjmR2|I3c+Rv%>1H{$rt@W7I*z z8I$PKiEm2FbvZF#V>)tLGC6x_^kjdJVri8hMOC~O>ugRQ1c-85U!0stm>Uio@9;YN z<81HGiDP5liK=d?Iwi1}%!yc2tev$X4i z{7Oh0C3P!u&^oSSykOPipM{q8>X^px@&=*(NGw(W9*U(Nh#JY;mG5Uh=9|bAMqS)l zzm{ul;*|cTw|L`+-7Ju*tbXYBc>NW#gz!_0l?}Nx#ikuB`p6Lblf*nkO^=)l(WB?- z*VA4P5luRT-^`93^%kBA6%uLbJ^AyFC!N#8U5Im&1a=NYJm__b1f4cTpL zi8puMkg&@%YqUB2{A$GouH~0bB-D){rG14>O;^qVwMDjh@Ph>58f0B>zt2MCFKN;I z5!Uz8#y;Y!XN~INcWI%IRJ)QSx^Lt4)C@s9(h@w$!J(KOj8Vl?i%)oI1&o7NA{2P$ 
zUU}Yw{;~27z-lddZ0YkcvI3bjB^LgYSI>y@Iq0p}7T&#!Wu-NJ#N2~qH`(DezpZzU zzQAX;8I{g&@~8TUnkf@d5JGlM=_yOzkqQv!8wU*T0|n8ZuR2=z!@v63!g|ZpBnzEz zI`i)mdT%3C3&~s(vN z`PBPowoM}PfS*@>>n{c(oDLv&%TYFWzgQo?kB|#*;`21S7JEYR*3D(KaF|mh-8DCg z8-4ReC;=X|%s1$+r~-ec>mL37C|yI`~Gx9l2P zE0>mzgYimyQ>dK^G)?(S#cnq%lv({`rwL}U4$pNjpT%!wZZ!_ECWH^g=A}mx<>S3d zfxQ(ES?}H=+mvCe%Y0zn3<$8~-UqXZ@b+YdD;vqiW>+XtA9tZ|$Wt^Noa^6uFDuN}ap!Eo9{0B9)O zT8X7;(4SDB(CWDb%de=H1IQDhsT)sI(N!Zb^CZqk5;(!gr!JTlX48idAV8%3)yA_= ztX&I}*$~)C$(nbhC=tWZZ)a}UMv>i|wXNvPEd0`QkGEsM60KjM|7DBNRFqp!u;YU} zAauT0@-65g8poJiXuH^64zmS36gBDRGnL(reGm>}I1$1mzKxfAe%*&N#z43Q@;s6I znEuKS+!l-Q+a|MEQMHUiEmn%Y@DRbj)P`zx@3|uHZ|^St*E$VGbi#K}?}7gO_`uwr zU5Ef-zJDV?3UVE#jAfSNG9=g=&dc*npgHB z{<`vO>C;2&%ueA<`qQQ#_Q!9sd)8SQvSyuq)k6HQsQ)-7WBw%+B21OHbZJn{K0Kf-87 zDA!-66!(-@(Pz4S6`5;xwf#k#MW-;o;bJ3vNOk0|@dkDk?gZ0;dDi~$yey-)KDM0T z10vfbL9=ZD;XtbsO>L6le!qIIO0=`l{9rNeVXJ*&LH*#x4d_(5zxZpmb?MJm^qdjr zXqKCtVb$_n=67pPF#qAwXI^Y22LKM+sEjn#NlJsLc)JuU$!nX<7~3jlMen=WS8y=D z^Fe#p89r|jW@dc8B2ws)f5;SQXwhRfv+gmu{)dzFn?m)9mOb(L+$`rVHZxS!Xwz}O zeG#JlLu9bX^kBKwv3COLI{Jy08b$V;AKocEx>89!sp6xTX8G~xsa9L&@6r`Hv!JcU zm4gWD60uujE8G~ei+)W*LT~Gmrne)LmEE4y^i_@!%NfuEpZ0g7{rT%@pKj{B;jAaU z926aCO}7|R35ey-M2-Lg+Tm%aUL1esU9!FE$l>8-;>v#YpfM!^61b|_mIxPeeQL`R zC7_q;YWV3{E9y5B&D$r&=x`E7A)e0yLjj$~Qfo9Pj?OAns1cTUW>z;w6Dgw}$sW3& zSNlKaR|pkTF$ioebJx@<9B9_f?b@uz_4T=`zb;$$6I?j?#)IYFaGbHPGrJ+hL+bIY zw?$P2TQdUTRZzUKxn(MNhABnD@C2j9lH7J)U~W^f!Q(U$$3}&rR8r%zqq>vT~!vzP3&@YSicjFe7>f3J@GzMhXP!<%WGHs#%q4L+*7?6=iR6P^!a^&QiVd}`6VNia&;8Gm&tgg6h> zWH}6Jy%t4*OJ^6|!_#JBDa2t%_=Mr*_gt6EKcdG*4hhX-2)ln>}VH`YMB)p9%-|4^e%3H5# zs2=`{54-(C4ig3PI@Vfp)J6Bzg*a8#DV9ATXd3j6u()hlh?)*k$L=Bv?htX45~9{e zZnay*|KdX6>g;HUYx!4{l(V44AC|X89F){9hwt3(1N5}yMK|5jb;ZN4X++?PevJ6a zX@Zg$^;M{$zW?HbDJvRWOx?1{j`-*Q=HqvWA4_cs^K^Fkci%+(V`{XnSgsem6mUyemF=_18X$0qoSf?TcHKd=XSZZ^PhZK0`anC?D*dxruV=37{E}NB*p>mLtvhH^FO*ejUBQd zFk)GYVgb1dhA;-{@*Q!{=ZTFKPJro&xw(YCNYj+of&~1)z0-i+@+EAQV3w%!}afu zk~)A%2Z9V&4bjdF<~u~#lP@EPP~4LS0Y-q!^>mGp)3pzgvfdnoh_KkWRHhNdc+9SR 
zVD&l9%t4SmmWXXUSIBtgdl0PmWx#R4+`L=&xhtpzr<@75zC8Tcni%hs+>R(4r`%<& z!B6V;z1s5*s!7jgoEg#(ezYB}t>X3z*Yh>CGjn^~)s~y>w|JW2yLLWKQX(VW^jSJL zfUm{EJ!{u?z_a!Aon9_JxnN_j+s6Z$TYSGuy6k$9h1G)Dh~_+0_L6(x=A>4Y;-SE# zR3DAn$LXhXy})`E>%Nu;i||G5BlZKCntn$IiM|f7XQAytxVS8rxHecG2GR=}=DiuH zX!yDZ{|4~Whax~VG@L+@H|GN3t8jB#>Vp;Jdvy9a#?$1mEJUc7b&z1;9O3Z9i^$na zJy)4K09US(sR_re896+PXP)=(<#vcW zFV@{L>%Lp3KzMG9aD8pZOB)qWz&|6e%>eAUZx@$?K6=GKh^E6a^Eh33{W1RN4KMJ& z3P;#xwaEC2IgCes;nJeCAQ@yj@2|WHpuEYhMfCM9pIr@w5CevxY&np{2yPmcl(`=* z5a!z92SB#)Cx91GXread0$SRHh)Qbh83H2@TYSWpEGK*g+dyQw%VId6lzCTyV2+v< z;P*ro+ylx#UQa?GVv0n78Sv~^XLfsyixsdOf0+r)7skKovkIi(BH$}xe}>jF-C5a- zspQKkY}&HrNw0)8{ZkUkA`L*b1w0t|eEwSnBSd0l9^B zBH*fa9RbW3JUS4+{o~jS@fq^|gU!Xb4dEGP^8)jZZ9wGyX7NRfW@Lr;jxi+}xz?S> zuQl67-RGzG*FN{Qbk`R^@3=NQ!shDc+H8@gJk#d6?04`M ziQ(u6#xK&1lF6umBb0nOoYc|;2SJ3SfIl16UK1Ymf~!<%Dcss_19>a?#x_;yo?1R+!2dWRr8 z#&04Do&Nh@x^d0b))M2Yy&jrItXRj*9!9&e09H=HHjD1xzXu>S2u$?eY_>e^dvo!V^nATRmpc4e;cJ+*$2NIb;mYoC`SQW7S z*l{Jj1L;2#q(!C*&OdeC1iPYZ0Xa#`L1^uuEC-zJWxTsJ-6@edf0g0 zz&1RJPtTnn!O z-?szQvf{ZOr#lVHb^yL$hJYXgt@*+}sB@T%Tu?A%KyZ@PaIUX7K3#2}9A+f+Gc?c0 z?d16lUn43a=;ROw|3gb)cgLo#WtyOaZ{6hgKngj22!R`9rYlqEeP?t}m@{4q zqp?A)2EcZ#n9j?MKxpOZAgbXxDCJZ<%#U@(WjKdWDXJ~^h)WuFOalIsy1|@Oj%6_3 zQCP(;h`}eKiDq?1?4Q5-X#*|R2zszGkS{YQD8Y;pY`6{F*Iu|%|FedaosNiJ8ZwCi z)Da09#nt6RnzMp~OqtS8Xj)6&o+H<4ahqtFw;-%rmKqcNhixDd#E&kT zfVoKr-~>Z@@M#@c`xxLz^( zgl)Twf@J5yDPS56Ik~nHi9MFd_^><3cGhJOJXNN7#!X@_v_n~mV6*x&eM1pHMT!XvX)v* zBMMtVaWsw0()!|E(8JTf0Jo7rKOy@Mojn00+Z>|EmK zBP8YTdFdfJn*ICNOg)4J&=EHFees&+kAhdc@MZOC1_qU8`78RNUwyyN&|=*Cj9V)e zE;(%a(QX6#8yqYWXUEp%Tz zEo}?m3I){cYKOqD`$9)u&nw|&OvqO>4~^-XVW_v*cnYwt*j0BgO@`AOM$#UCCu za~Kl9UM=S*yOFGTwRk1tD*DR)=!U6d_TIz1QolxWgi2t*LDPm5^Gzy>X)F$2IXR{l zo5xhIcrE@Ud)K#2!|yGaN_UU?R+w zB}eDyY!z_Ap7z`|-ckrPU(1C+QM-NO(q>=Gq6GgyxXLXFJl+B^#w%Oj#1wS}ms8RB z!6E$1B@J|L7N0-fY#!wa`W4#?(EaCoo3ugc;vE@XEGWh9vIsL=9pEPSfhtZXQ|N|G z2*gTpkovKj3ctw`-LtX^cNJ0`{^EjCA0L(RXWQOr1soarZ2Ng5za*bed$ec<<OGdnNJhb 
zZY1m(VqW(&SI~E@wI!^nGIgt--eMkRhgJ@um4EIXXt@}U)CUF>S7sOqAK~sE(ycJk ziW|2_^c3mNZ(v8Ar8{M}o_B!x^!+P(^W)VRlh_B2gz;UdQuK6RAQSz@!lqqEx0W`- zet(w0UiFc8AE?VNEdyyn{fo@2kP*ri)_DBl9a>mUe}YMu9A4U>wVNtu$ZOQ|O%1}y z*)Lw;Ze73ZYaX3Ozk4#?Xdjs;gv80q-f4JB=t#0{<^Zq)PZh}Iw5QdRMUL*Q1KGPF zEMeq$5&Jl4Yk7y|S95Q&sAvbr3d8S#%u2djq2`WspGhJNr;~%Ul{iV6uIYf zpmiMgC_Q{SO8R_%b!Q)XnsHu>o7#;r(XtT`@{5E}&xxNqx!Mb~Y&PA?1p=%AcI5a5 zc<7cgaRaf9>>JY=kO2`8wtCt=*DI9&bgzTk+M>>?s>m>|PJt z+zw@FWSx5=+KW)C44gN;F_9UGSL3SNpP zl}t4sJa`~4q2U85lP+|~+8w1e-5}NJLo=bR2>q%(>d2fvW_!C7bYQ)xd1@~VpZ$p3?KkMY4(S(tMmd-~xK`4Kw0xjq|LJy5>c18Y zdr8$L)gaqoc^(Cb-)&Df9skTw@dY$atf;U?wkZq=Xw}a1uR`k;#NT)Fa|y%=u6G>s?QTG2%BHA*_>cz5hnEWndl?Bo*4Me11-?k5&W#C_)d@XTFIrBCk3{jfT} z`aI*>!uH>oC4pd0!qfTW*ZDaPRBvt6CWEk6ML11V%9VSlNpzi5F8*enr!5_fex9fUAUU$gwJ926FKV*tp= zTkPwnSivGBmtrlSf2>r8_Z6HkK(AEcsPpb1m9qIY!(8$sY7b+=bn(M8ZySxHA`RXz zzV#DT07?62yeucX{9Q2Mi2&HglYE=4uYd7qVLmo8Jbeu+--s2%&w25}J0bunPCouR zR73FcT=QZ@bpfiW^_}I6^Qpy=L1$SPr}Z#*Q{H-p%^NGnY{id4E31BjwZYtK&nl|=*rNLS3?t;>G<%+c z`(a=55;HmBZA292&Ie3C8i9!r_Dh3~mjNw}66mj|wr>kGhK;*&*`zez$^Dj~ENgj8 z#zLy;+K+n6Z_JuEfEM6F5?ltSMhzjls&nTyaeq=AkJnS#9cAg2He&K-^F6mbtx(ZC zLK295yvpMvpo!XaK>fTEE+rNFMk-1ggh6ra2hlTLBxtnZ4XcED^p>X$eLLrBL|&wF z@5`SDjm!ll~ikk#nUz-@*tC$T{Zrp=?vHbf!$=~G%0p$|) z`}&T->?LjKQ|c|b1*Gr9diiK+*fEjTYWu8V_F}=shB5-&*urW>=;RnE2Lq(~LszaI= zR*@Rh4<4}55x#*&68hR(9|!md&Zg*KGpj>)++cjDue(Q}Hi6~F8e^t&>#!k(KaPC{rd;po8?ox~cq2Xzyk48v7{hMc* z2dKfCECccHth$W+#r67*Gb;?<;p%`K#rAEcP&{&^jdPLq6s;W&)Ib?MO}mJ#Rx7-hW?2Y z%hI}C33i_cw7q6wa^A3>_`$)7LSYW{|-+lz(b7Mw1^tImSbKW9k8Bi zIgDiYGShly{mHQPF&T5+VYOkbxSQ%gOwP!F(tHTi#6JBe_mThwNdT3g;^*27G_q4R zTu2RdM1=CX>TYe8V0kNE0LJbEWJ;JByDtsjs`Dh@%Ka5UpBbA%WlJQ7@$7G z-J8V=8}gxvEL2`L9I6nG+dY{op7u{+M4FnHGB?vgGGm5pDw#Vv1{gQd40=eU8QDDp zMrxutFGatb@rJm!K%dZ~JIXZ|K3rwVv3S~Mu<R?l- z!PrIcyjXMa=X(=Re+|0%8-^J-B@)9<*w>et((MyN`&0DImEi<5yU%aEVWbfP2CU+w zSM1X+b#{3F)B;dk8#%f6q_5)h5<%Aq>_XzTlDVRO;IZ~s=A&eyev3VjRzi6c+dsh< z%+V@~AHa2N&lX>>Np6nQS2$jq~X(4cu-61?mUy`MvZOh{uIuh)o}lm+N&MA2S!+{&w@yT1$V-8>4?2 
zWD{Wj{WZ#mAm1p7Uo@8La(Q0D5Q636(Bw8|4Cot5rI`<(tY}~jM1Dm8_gKl3VPjv}YWgN~R@ZDn^0rc1e&z;0ax%TG>T(4y>un~`XOl94U^T|#7ZeW`eg`39gu5J^*b{_^h;Kl4ljHO4-L zCB5opoA2NAGzWTaMYr|Tw{Bu&C!BVb3B5X{*6z^HToOE;f27MRlDN6|vTm5{t{AU8 z*zEZR(x@YBFJ>M0)4Z>OxVdfcZ@hMp7D1ZC}Ssl9#H3w!Vb+mJ9e@77;Xnj`M79G+~KXHSaGdOPL*b3;d0hQ2pJ;y(MI7!n|9F_A8fWkx17&hzHHH z7G7WWI*%*H4lZBVh3pnUK!bm{o zewWc%B5;e7-(-Gtxxq0uqCOwI)V7%Vu#7sfFvw&DL{Z2*a7sjkqo1|ZhPochz$LN^ z`&Nps%y{<*WITpt>4P?SJWGq7Z`iTRlvYoFe39@1^;H5EwlA-L3M;*Rcv-h}^pOey zEP%c~V0Lo2G_{$%+fAS5VWoKUS`~*$zg!q*63z%&!5Tr|VQ3fphh3x;uejS`sk@Pt zIrg_asE4Fg*Kz`s_b(O8z(E9b$dX;EWAo##Lr&CYy=+8F6!+)5m-!1)gMOUPIKr2F zz#8C04q+rsqNLW?$z zG&0#QQ0m)&HfLidPj=axx#AQ(q;MV*Jzd6b#=oi-?hMK{$q6J9Z6H5Ae>=QWt#V{J zFQ1<7YOqvCfuXlsE%eT1KDL92fwRO2VmxGY!^@WQaQ9Z4o2uL)T*G84@433$)y>o| zuP4h6ADcA{rSau|)-3>?m0_ZP$d(*>k(=FNA4XIg`LeGP z0~>N~m)sq8TIemZB1O)wS%}+tPe&p3Xj=90oKBKu74*}UW#6)^2=xJT-!g2uBP%?3 zabyYR51kiayu7_?O!lShY?qVYB8C3Nqx|Z0sP$N){e3N?{CdL5YqRVq=x6F)nZA4a z%$iY>$dM|azF(@=PhG>!n2pu;&6m&?eFM)~9Fu1GJU@BaN@vU3@2#t(VmGJcjYAGE*8%cP359E$0NSXVt3N$UyT!07w6B%c?QesR#NH~3fMK#iO4FgmC_ zlfgBz8qqxjC=15nl?KHO>Z1OHmq*Ad8GWK1_8&?^Pa}o-u@s4{_1*;p6y`@zNWF~V zy_noORypYh#h`K+R(=}Vf$v7pfleUNi`SSO;XwmgHQTPtZ=xgU65@tQW_uBm6W)?|UUjKI1xhpqWF8i zv%~2;?2KP{rKe(tviaLLU*ARAwu78JRgfhBah3;y?GL%`24vW zSX*iu^w>lv)i0h0ei-1JQ z(+Hoo&Dy0Ivt+Vbc$tl_P4X&~7+pAF8mNeOCWOCovo`KJkP`JN_8J6&3U5|nsgCG; z0tH?icGW%V1v2!vwMUjkUk=z`vC;Ns-JfLc9)9HKbXYa?LPr2E$jaX6FfPd9;JkI1cKI8_EH72>y%4XQN*0c!2vav4Ig8fu_BC1<3 z|9t2=di=vJU((Z-n>q`STJ-<)2)QF0c*d*=X`78k#4PcUtE}ZXiQa6Pd0!uR9R83w zd+zMKdPP`yzTknOogue695Vc(nm?S__9B?l7%da+#aa@7LA9<@wycE(!Qi*<)F9FD zwxDa5=ccpZgZG?M9K-q=$Z82B}S8nd=iNo z6Rbtic-^6LAN=ClI;G>>N^=@gy%pk*G|K!MOP~`egI0aPEUkp z3Gw{%MvX|BJX6(=ta@km-(6}8*aFqdReE;6^q7? 
ze;|k{yx&%nQ}8CpV5ol%H>5@Qi(3tD2AO7Sg*mh&%cIyQmic*S|H7H;wk@xn&=%iz zOk&ACWqIl9ZAH0VS+znB4JCJ)k z{V;;O^^{#xal2vJdfzp3#QEhhkfPEi{;-ptC|qnUT#!;yXa)0QI#2)Xqm9=;OR;W1 zV4K}qo_@zg>n)k&)2Yt%6Evpt&?W@|Z&fyK|NIl^5|bMwx0OF=Me*XTml?sDF^UDD za-c)}`PZHUMfHez!-?#Eld&HS4qe+n70#_KZt3e8zotM3*G?tcy4pcIRf=OUC;mLy zbRHbg5|L)0UOrc}_ReerXA`ElvdLuPJQpN_sbBGEY2?b|K~Yt>@x|juMvS?!#{`;` z!JW+K({4v`G3nw1M>5}~)nBv($jQ<7o5{g5^hIzB)k-FR6jcYaBx+52{Ca~Fm6PBR zEz}BjdE1N4E$He(TL(Hc-i#4yOLOKfk;o`L8U6L#5XWP(`C|^Z;WaNe@x8DB;d=!( zug;=AQiZMW?o?OZE*cW!W#{7s)EJTv1JnaA{sIckZR`1{h`xvlOPRx;kh?><4M1|# z?_^J0ZPlnC9Tq3G|l=-*LDb`dg8Kwsz+=;C&gEW^1Sa$qsBmPYM?g<;?bDQ8ipzO3k!!OpA*#I zGrnjGNU4q$v$P=>7jL@iC8Oz=iLL)I$KAuS4%Ky?eQ|av^TgSUYoE&al+S1-UFYsb zQ)t3=JzBS9CM`aZc-8MyuO{QE=XLpapO|kxF7*c#V|azVzv5k-9#&cN(m;cMYDaY> zz>xLSx}Z%)1OZ)uTO|1DL(=8bRKoS*;5G~2g!8~pePK9kou2)rvHcV)H8X-1gti}i zguMm{%$bfay1oe`dEpIwd5x>-Joav-t!RPRrCu@QEv?=+{nQ?K(j&clK{iI{txECr z;pdNA^8zA0g0u;g)&zdS0H!-e-@lnrG!VNQwfCC1ZQiJqy^MTa&ukQ~XSi-496c9s zSRncMK=F&-{cN7dFGuWaJ}3xd)3!jM`_cR$;~{|!-F!7ZZGwx=j52?&m7%!btA^KO z9?qS5oYqWIJ+YFH%1!^)MUN2L{k10tv*l5RAnuYQf?Ktqb`NF}-8}+4Q~PQkuk8zY zga%3#mx(?0akxacub6)eX6p~ZG<|Ar_3G%n z{}PnH_qommFNqwiHB9WWnj)Xc^I^SXO@FrO)kSkZ{~6>fKt4e~m<3j-{GCRD&73}J z3~6gf%Piv;!^u;rU<#K>zq}0ar;$N~MVxyhiaCMOi0%R1neBV@t#M&5`^ZUrtI(54WGjGXcp9FT>KK&{n@j zY_s@D(Jrq`;m_5h9v#zePiWbm8P=UQT+}bO!CAS0qJyQ?(ig*Id|Q;$A}>`7EoihK zdDsF)s$8jcKXU~{LaR$2o;E+#e{p$7*2$%<`%)qi@BMUEBaT5nmRv#l1@u#i@KpI) za_7EbB(2Lcue&7CxaHZkffHv0_Nx*uK{j!H$>c>_RI!N>_SguU>b~LXbm3-lFr+zV zWYbjyaIgg%!+A0<>_hW8^Bt^SmEU?^oiiT^Yivt+l6tYV)($&y72^I7KC!X6X`5V7 zUqT2Vm&}`D=x0if_$-aS&db!K7uPCxAoYsI?ER5YDbqYn{uB6o3|nLs@JkRoK52bG zv%zf@6rV9C0C&~>Y`T7s6{+DeYv0LUZSwo&tHVz^>4oNMowOh~u}6k2W*+Fs^zgR- z4%7VUtAfG@jI08ZoGzeCKEjR91Nwj-DC1^=ydfTXXX{PJv*I?;Zly}#UB`UepyRd_ zgv{&)s*#554|-!@(BVXLTrSo*oqvZ0Y=dZ zN5pnHilDL7(h(tNK$nS$iAPt#D@Ro~L>nVtK#ixPN#%6&mqEpmWi47c&3Cfixn4g`+&BZ*pA(V>;{$KlBjV36H zyhiAoXOoCp1YH%1%DY~AEpldoWo)0W!7iv)YIL~b$f#k=-**4ar%)~ZdT5rnJ-?Oj 
znj7E=ayPNMa&T;I`MxzSZdfjX^}_N~_?jA#jJ)%_1TRH6TzYHl;l?XSO&X`Y?)N_z z)4jW{YEwfYM!41MLGRXy`5vo7en5DpuT5^`yp6llQypHbI@(dTbd-LtUIBQD5jsEw zsFKgl+Mak?0V@gIc(D-#yNLAxWNQQ>7Q>1nlKPJ%{Maypx5XJ&86VVPswknie2Kd6 zYvPB_Z$@llmtokxcH!JvqU`b=<3%kX)Nc=S?H$K);Ul9+a_uvT;`JKLMgqRL|0_Ma zKoloW1`bqc2)nGPy)S+&Z=6%8(a} zFLG5*UgpDZdfgAU=s@1(E54v~9?`PaNV!ykc=ESfvVW5E*R-ghC@) zpz~#JbHi)e0$mT>Cv;tDEi5h6G~Q=fMWQu?*v|==c@eucF%B5gL4G|oo?m5kWd!2x zErinK+s3LZF9g)H0#H)p6Z6i(*$L4hnaS5W00Z|w+B@s0s=jvLOGty#Al)E{gl5ll<1C+WFGB-^H{jW~&hnxeEH$qcpDs$D zvtU6&PGd-66- z*#DW&wfd((I-5lZTUsO4jBxt!l$p&Vg;s73)Ae>%S`E^NfjW{zkDU(lqr>2G@C=Mp1feUMs34a; zA=$!+R@Ir>zW>y&%CP&;&vYI55fG2?0t*2;c03`lk069b%r0#U*7j>&*S@ZtxU>55 z5+>9^aAzV!@CqrQ!`4}!wmOfzJ^>08if0#Ox+HC6<@_|K0&COelgrbkv2Hv@|UKb36cEZ-dbw{gh=PGVaSDL{ZmHg8=`i!E$M5-9#u3bcsN^OEZ@qG6eOL) z>V_arIeQl}yIJ3=iQUJ9l1&mnIr~A<v5o|%bu_<%m{naJs*3Yab)XW8foGy*nUe~{c%-i+1%lb$m{gbWy%1; ztH&ejpK1xiY$oYeSD_CN>;;Y z4o>HP_+Z5y=aYbVK>OX?;}gi0Y-cfoNU)Fl>5L!$2;fJ#{!NqEU^WhHDg%`prn@8| z+;)Mj5n~e(pGWUI#MPDIM2?UJfAD%LLJ+*T*PNF*czG+o$YRq?FA=Bh3_S>WEx-Bh zvh`Pkd}pFfarmt&mhkffdpR~mR^cCa`Z;oJ^3IE}^M_K|d-D)iysR2UH@{z>+h{vA zXmMa<{H}H-Y)lRILlj6Drv9Yw-ZZd?V$nxW3C&mKiI5lRf8ZuJ^{Pedlx#3vZpAqD zCrz6@1*@Q7VwwPs<%S|2gU{H}J##{N{Slv|a3qyExK^p4UZ=R`Um4`gO4YlhIK1)T z8V1k1j+0%CxxX&Ipl<@_zt zdX@bk&su>^?~u0bS{P6K*RdjbnP3vn(F*+t7ni9dy?&j4B*e)${`kx-WM4zYFd^4+ z_WkJt8B zs_xHQ5kR_5*ylBs|Lj`10)0DlX{}{2dub#W(B>P8dm{qdPZOKvsb;U#Ru4dnDh4nPoAWOl6T`P|7^$p1MUW#tXVO)bDx+`TYh3l>pP&-yySb+qugQ3VdUl zAN6$p2{-3nqkPE`0dr^qu4k*%Hc=}X3f)ZVH0~eMhmtgsd7#>W!oJ-*ppq2a$2V+| zrII06Bc&SCM>00{VLKBk3pXf zFkfkv4HuknadYVj9~s0$VzwB=#mWB8hQo7J>|NndeY@7~(Xl~EepNrS%sX6dSo4x= zIUCL7#0}@>A(0+g*9Me~tRgCbJNDGu39V?XQF-VosGIq4Gh*lw_ zX5m=)Sj(0j>5$-3Lc({M#!wBaZc-1M>Hz~%I`}Ns z5N}tOLq>?1?J&AU@i;tv1b~zwN%!hr&ah^Bo#wMhH;1QU-YS!Npu#A+GWk07^>Sj> zx9*F-(IrT81=#k*%d02IeWCYyUwe3yl*#Tknsv5%%n&3-#9|Ws#TxtGVO(&D6rF|Y zM$9b6j+h!a;p~UJJg5>p`6+?U+##0?H8#&Ma9HXaF`ZQZ*K~5cH(-nhyveul*5mK( zUoWQse!CnpZ4|AS-zgM)pu*fv!{nL#>Mzt)?-_E@rtxR?r;L?Kx@3{>kVSFn+8<}A 
z;|r%?bOgn*P|JOSe^+bz6uhi09sLeh2p6v|_g_P)i2bYB%uH)~E;?k6+tn1TcJ)C_ z*+tB!X8JkyzrSn$^>yh~z6gA4=@tc^81bSZyj(UAUkIp{{y@jCf3)GB{BR}l8SF{V zkhb*fzs8vLCJ@_XmQ0y99J0u)+z4}<1L`US`PU(DV;}uc8V0=bFwlyiFqgS<4vu*M zgSX5Pw&*#Y3`+-`I~*zanF_CvFQqt)HgHo?xVByc6pwNtLxR#`dH+}e_X@f3wR>Vt zmL-7o{)vVKzsf}e1bH;O(B>Y&eN=>7&r%q5SY^P`)JpZtx3BLIx8P7><;&#n9a zv5Uo6B-GD1{#GB;BCF7r{&>lzQd7*y_BXnj7P%6T-~}C7DSziK8l^`pjJvn%{?5Vw zbK?KgOaIdq|5v-BLg5#}?z9XDOcC+b4TqXQ3K6(T@B^=dj~$`5o-2Q;dHqFnbGY4h z@-@8$4y4Vj2o;)T#Qwy);*98^f0$^#vmAtb-((@=WoBv8WBUAkwmPjyPlfw=O_S2? zS^j#=e!lvp`{zpI5G-JuYk;$S@FIY5J3zppZ3SXU`GD~~-QK!ZkQ7GnlfjB!N70qE zw+bM+hd^4uFHGBnTJPF8fd@|8Il= z=*=&5QH;($Al)fMhM{sP2z+pY5s~E8~zKpgtLPpzXwx8dcV?08N6xSM6Wipf)Rw_(jB?WWMv#NBaA$u1<`lQ z=^ybPD=9I!G;V7L17Fqrvj$;y-RqgjAZ*T>^qPYo*uemvoEWb-itiFOvBKl3Oc)uA z#6W=P9AUyo-CXLopms`Q?yRc%3H5;wIuT;@gMp>1h>)`Yz>I@O6@y>Lub6ed>x~DGFkt{e}w4VX3FNkwaq5ZWl`dq^wA6?4LH^ zwd)9i+5qf?Y&(V`Y*c>@tpZru#t!US&3kQCm?CR;i8KQppC2~~O{6KzSjN4Hm%o6+ zZ1-}QOM~H%z}W9sdXC6h{ozv0b%Q@ykk+vgc%y`n(MHZKz|%zx0lKX9Z#NFBJmTt| zk#7Jw$yk)V=@>x(0uGM%$OwbmGVmWU!f5cxM-)8%bk~{DRP`j2C`5k9g#?kGZeucM z`K?-q^>GJCzHw6eBfF*mKHD)UXAXLq^a9lP1p)4y-%9XwOeGi>GS#wQ+@yleg}n zAPEL#KA+U$b*tuo;t5c9>`qcypO8Dk`uALcuMf$i z69yK^=Fg{1)AW3X);^5N^F1N!n*;ZdZcKZ)9m#G8MDK!f^6{Tdt-j>`A5uk@BtrBZ0*h1=@%7<}9C zggE$Hpf4R^{F^ItJV6}B^Lk*%TUZ?-zQ|{fDBj1~?l$hWXwN~2z3@G3QCaFy^s@Jz z(S0YUG|!rK>|Mh=@6Dll-1;s4`{^r z7Y8jUg{ufHjkaEiPTc;4cV)f@luPXrU-5xE&Vj#NZ)h3$3bOh_IwOlC&W{V?jRgK@ z5Ox;m=Qr2gSQE(~C>8h-I$QQK-y2Plv*5VYCntu?{1gao9>)oVA%SgF_q0HRxPGs$ zkZE+|6Go{;Z%L{#n|t^V z6Xy;#WIKJ#k#sRocF*RB(fPczKVIT>+Fo3U4c2`9qP`lRC^WOOvuNOKIEPF&QWem&t)7a3Qw-pA5QE?57ZtoEU zXKd$H3SuOJN8p7w1&+p3U6R)U(|VKaZ5wzFHDUO^wajAmve$hM2cX`;eF^L+Q`XR& z#i1>1)v}}SpkElr+Pg1P+a^b-aXxUD#@)*1XF6 zF-Xg+!}1!VL&ERD2%S=|)crFw)1R(;6f1IS?5R4Xn64vm#ms&xr?Izcm$3>Q!Dp6j zR`{$b&kBz+Mq2#+v3*Q#$x!xkf`oozM^|gzhBPH=|oM?ViBw|8Lru21_ zWp~+a)p*HY>i$~M&JeIlV5#qRAtYaR^mTqmKZ=?0T@&IU@wysK%z#hCH-HuQ2Ovef7VDJ65C~!XKuwsOr)8(O7 z68|AL8|S$400_bej}gZx$s0^sT6X_nm4zxXvl+f0Jykx<`AxP<>T8G3x=5$|LWPa# 
z$>c4#Z`5+W#EO#@?~p%Uv?WfpRpA8g8^hGX+t4Abjda+r%hKb4I=aH}hS9<4Tll`I z7OTXvYO`_oBMmS^HyC3q1v%A@e8|AjAo2oW{mHSqx^et~)jC&D3$;cNv-xC9Ki$o25D`IYlihVQs^jWNFa1t#udL4C{UEQu$k6 z7oHR_)!P4HSki8RlcxZ$u&;9YJJ=<5%ekLw_E-92^JR+%_XOJtS^VYTn-@|gHV27+5w@Ir&b_D2p;@XH`!>jgz!9# zX_ALQ9U~xtEuS#cle=7h=nitpQrHYh7m4X*`bWhb$H&uWgaYAlxzS>J=WkFxqtL{2 zo||F6GCw*z3LYcqlSlg-AShiaNS)r&p&(pkjxaVW*osePxL?cJhU@QmVi`5fjwnwW$**lUVXa7rf6mj?P7b7hxBvAlk8^4HkJu zN{9Bu@l|iqyt<8^(ve?AZ-orZ6xrxU&^Ogbvm_oaJoidr`HH=dAyZZPPvJD6B9Mbn zF9$w;T{FMrBJ2uTk&6ZeLZxg`+%IP>`i)XT!md?9A4YLm8@4 zhxqcOtM-DMBdRj&vFAOG7$KUFrYD{GoGk17EFErPN`Gb(XlABb1Dqc()_xZY6sIIu z;dayJY>0k&N!=cy>R%*ikhM!OIfag{VHv*SHfJUJZd2!t#sd-m_A&WQ@@W<(pNrhl?|hOmXfP`TQsiGd1dsSRIPq#_&vhivxu}RsWRH zAZ9MU(c1pSrJu{S>>1h7R40v&8FQ`h2s9y&ZbPr*!5iWb@uGWN4^+6>dqkbQsx;xRVtQZdmnU&DpMnTwyI(4I)3nuN6cvEA=~xyzc!ztGuU)A?$Jpu(3T z@>f(Kww*X;)US_$BCHtnfq_WR#l@L6C6wM8>V&l7O5$q}c@hz;9zA@E`nLaL5WRuc z!}j~HJZp4sue1CGqZ4X3=6E;eYI{nS77}=66^BYpbL$HW3QvUJ|2$qMkIS(X{gL5C z3@!dXag86EdjtDjPH#C?F5TAz922U)Ua|`B1~}Q#v%?4jd4Dfk&G!&N>slthy$?PY zAM#0ke`=|!VUgFEGRjM7>Ugfrk4H&?)baGxHgv|Jwl0dcrP`cLzn1983HIca_{Oz1 zkKTP;k5;nKHlv8l?6c4l@5`PH(%Gy6^5jfVViN)h9=@5)C^-8cE^;pcf-vDj6_9Ey$EfRU7YvT*hx{f;AVA=6%6SG*AWb1_!Io1+E z<97mzUeywMQ+0>7jvGgm&)6dv+^+{AK+6qh8Yf21Y?)t8fC)6{dU!(Js<2YF$!FesAA5F_;IiM9L9 zVV8&Ecip#P=iLn9Cp#t4xKa#)=k!Y~y_>4MN=}XC*MIVRONI9#aSR$plB9~tW|6e_ zkT7s0#NZ#wKNLQCXO=|7ZET6Et;o54ll=92pz#-9T_-C!CtPu1DH_idKH^C~Aj0;( z-=gGHp9o5@aMk$de9_xu?HlewdOlrfh0o&$d)=L8w>uwU&(FL{hVSRRFKmalVrB1j z;qj=@X|C1vMFh4V7bzR{7$@#O#h!f;enHvk5g1g|lS(=i%!Hbo6W47HAVftLTYOXyYP^BF#|iRe4eo*b+l^5EoMSEL}Tp;H>S)J_^h)rVH%C$?A@}Z=&)@Dl$iEfTA#5**k}eU6ff zTV1K5O7U$IBaw!(7TF@nePK?$C|Yq6&o?@{Yf_^7IDfoACW_;=vP22FX8Fvqr7E=Z zqUo_D92MH+JzHfR>y&YIXgKEY9`x71hz9&MFsQihe@<}WN)zbX?w#X@>NtKqQE4iq z43=GCR2@QDE=9>Vu&qqQ)F|7sj6w?=?ZuQ-Pk8OooTYaSgw1BySHK|Iyvj6HJ!LJF zeUV4t2fya4A(_Lr%Y(Rzwl(uzt*bGeQryIs&%!sQG#3X8h!%hE&Q@zs{iwo!NLFQ| z1^s3xrQ&`(z!$v1&^c}`7yxU-XJs!xp$aRWec>{YedJQz`|)#?xwo6i_d)5%HKJK) 
z)%^J|nd0IHSoCwR=xxgq%kjKae8emWMhDD-vTg;K7yQK*QtFpDt>p`&bXUsHG{%TDD5U+66NaAnGtH?p9NEl*zM5sn|2YdDVChVyD zjuzzi6ZdQsZ0;vT<&L;hAJ)|Q(^9VlMcOkEPz3X{h6Z73w71_ZU#Sv9m1kU6HqW0= zE8|Yf$#8B)t2IQ`pD1;4GTWVBs~52kOdTu|V8xE=df&!A$usmM5EbD>S2L2V{=}VD zcS^S75wVfuDSG2Crh#EbqE6I4!Io#>mV{=(6LoZ+XS3eCzn@577+P-G8L+4ui~rK- zMR)&bciL$q=2zUq>7_*6O*^IK$Y{3m04)!C)0qB(4rIAiwS6~8IZaHYXRGA!rb*jI3!OQb#fJ288hFt9j5ReCKU zstlDoHOEBNBWQ7}fM1ifJXuw!&+Y`4Icqwz=)4uGHN*7tnTW;tn^njgWX{4u6{8-H zw&NkuG+Z&5oKhvnIcCyo+;9ZXI#X5>sg5gMz19n3ClPl}(fzJMc- zd&ui1j}yk7MI?65Ka=XzOinW&$U)j5)lW%y+4eQwX9_9>o!XYEDr=`q;ml<|aoeli zh+05bKF9c~o4%izHx+PV-4tSIM3|W;;IMvC+yr+&JHdG@zjo8a`9oZVUP*%wBJnh# zOl`0tf3Tm6z45SIjI?<$&Rs_j_^yQmD4<$);j(t#IsKd(Rfu(tjBB_o`25|{$DQUh-?0+NR zT{wf5#lqH?lvoZn_&z;{91HOF{D`Ap`wahWsP zi_jGmB-r)9fYsE6*f5caJJn zc{x<)c)eTALJnb@#wmK{vNzXY!)aK~wF0&u3Z6R-`))PA&d+DeC}x*FM!O6B)Kcn{ zb*0|O-t)EPI&$qtoj5#ZrMQTLrfvTWv;AzU0V@ZQe?Oup*s)${5ZG4{l`?F)*waW= ztr!kHuyC%Z^%tXnrGYty;kWJC&NP%(%ORi0IC0B1fz-E7Vj9o4jL_^6O=u&R)`wnH zPhncafO2|SG&{)X;|&>%%yTAi-HNB8aFKa!}MdPh?U2H~~L*Dd;;8Q(U zLz=yP{{=?(<{PwUBpSTu)%{G4;gKP6uXx+XA1I}%IMJBS&jIB=nF?0p?@SCyV5u~w zxDC|-69DvQ`U)^@3QmRHkm363Z%dsTalUd`D!59W$q>oA3VU;OMEd|a`|#m`U25?} zwpjy^<&Cbz@BdtW!DxPYw8mQAsUq2-Hlt)@uC=ow>ShtCT=uwUG-yGR$!+?es%Gn5 zmAKvkI zffCe8$wgHd)Q#~Gh%KB_GOD5C?-jX8zT8Yk{S}tWiut++Y#O47#=#|WS0+so@5 zTSYY+Tim=UxKgm>%DnzSozs) zvcq`AuN@b!LYkYFq2t`=y9@Bm_j|Krcy32EiS8oS{>8R?(5-nTN+xfA@aEo4y}j8a zK}yQe7#jzRYKi(@5H(o+8T*a-}r_k0!(4Np{V ze;Q=H^;f#Cte)c^WzFAP@=|S5A z(p`mE9G+g*KAvvD>GahC6D3}K8+(UqHO_4EVWP

m^m=cWXDZ7b!E{*~tMYDl7vA zp)oGU6v$w9Z$V{jIP|TD6C7+Ca(<2(_>rh2rv7~Skjy$<28jsPEV4j=7B4!5S@AU~ zu+`%>-qnV(rC?NV51GZvzyA{&owCV=fF_(G5<&z8v$cmE?)@P37Rmc8VWGj^9!|F0 zY^;sY2;z-Jr%7!>hyxoo@4!n(dWia3%L;c|aj~Ts+2tfAs@fgv!!ZFh^w6$yeIO4_c#AMjQR%c~H1iEKRxYS*Rs{?WZCX2%f|$tK3|u=GUoFoon&EjekxZA- z-g(7!TLm1|F^!@;dc0(NOeUUVc9DsAcF!%|zFawJdpgI08wM$=k!B`d#Jc8xPi_H%E(1y{o* zuVT}@+lXLzFlQZW!U5}kQ1N~FzzEnxZ;H6u$n)?QXH(g>dTyGw6x`*G0~9tj`X%L} zhwX&s^L0+OujNMr{(ueB>>Ad+=c=qITEZ`_vF)17+c7ipz~=!}NcqD?)r9 z69Y9Ix5VR9)<4S7Z(w7@n^2%OzY&A)zxZNr}VZQ%)?3-c-;thvZzx(n5WhE%6C#m@ZwpVvOIeIvm zRKeC#Z!WLdNIyHt+^VJ6{s^y=m=Zxoq|Q2dwy+;pDQl0ozJ$FBWqZBClpZg0MJ?@; zrpCm@P{nwj`jN5LnUBs$BxkVWmiE%liWf1;S5kdy+2j$=Omew%m)~#I%le7MVc-af zY>hPT3>ll-KVhBp9Vr~El#McFQ?Q6djI0@&vcR6Yil~acI83_lP&kji?UCesRdIJv zSy|$zFIhF@N1woJ8-@&#U_uv&N&lX#g`Yz^B~9g|>gICyO+3hR0FEBHg~3{rd6R0p z*z|&if*3lZHxVz;QWf?E8SB2zSc@Zw?t4sJa_oK9H;)&ztaFIj)mLwb`Q1TG>!xjd zgru;bbU37S$CM%1>F<$a<=jj(v%UR|+fNUiqPP(rpCL)oEWe=xX5jwJaPLikU;a0j28*UFhKP(EKY5YRd%{W zp_YA#g?(nG(sC5MsN?QSyPKlP7vs#-*cnFgSTg76_Z6o%S`fzeUx`wA9<=iQJ~MTM z633|=3T)39H47Z;MD{(L9bTiTILjFZ{(;VCbW=$I}ov)ai&)3{X) zy*Bff*3eiLc0z6o3(xg39<++~=_}hZQVGH=DNC*`+2i(%wmVI6SxD%Y)XTvsmw}k^ zwjT4(3#*>4<~u74#Tsb^g{s98&ALA%S68EMv$52Ud~W5-PXk^AVxp)pcua9HvXw^Yx`P49gRA5ty%$GF>DmVp&og zl@nihK#WYdkWP{MojydBD=&&V+>U&0dAwNgi8S#(y@zQ{;B?Sn)8+(zX(cxk8! zIy6U&YVsJV=Z=F&5>gz8qipyY64h|mt*4`tfW~c+ra(>-R0Od|6{fxC3-cFi4l07J z``6-Qh=Ig_&plK`K=mJgq+$kn-L_bXVekisgkq7;z1Fqx|BGY6fKVY3L7j}I@Q^Te z;!u^0cH_nea;z8J@-dqG7C>e!#bQs6&d+&}(-SY!w9f4cgAJOFXVpEAg-<$};BL|Ff2 z7ySG8NRkZw`;Y(Sv0^Nk>D^mKl}fDt&6WP!l@LLNQU19#|EaktjG#lK$6sdr_m>LJ o2s#hve>(3!+?fB*citbA9?S~|bmwJBx3rQj2ZvX%Q literal 0 HcmV?d00001 diff --git a/docs/source/index.rst b/docs/source/index.rst index b0103a336..34eb23b28 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -43,27 +43,13 @@ Example .. 
ipython:: python - import datafusion - from datafusion import col - import pyarrow + from datafusion import SessionContext - # create a context - ctx = datafusion.SessionContext() + ctx = SessionContext() - # create a RecordBatch and a new DataFrame from it - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], - names=["a", "b"], - ) - df = ctx.create_dataframe([[batch]], name="batch_array") + df = ctx.read_csv("pokemon.csv") - # create a new statement - df = df.select( - col("a") + col("b"), - col("a") - col("b"), - ) - - df + df.show() .. _toc.links: @@ -85,9 +71,10 @@ Example user-guide/introduction user-guide/basics - user-guide/configuration + user-guide/data-sources user-guide/common-operations/index user-guide/io/index + user-guide/configuration user-guide/sql diff --git a/docs/source/user-guide/basics.rst b/docs/source/user-guide/basics.rst index 3c97d1ef9..f37378a41 100644 --- a/docs/source/user-guide/basics.rst +++ b/docs/source/user-guide/basics.rst @@ -20,72 +20,76 @@ Concepts ======== -In this section, we will cover a basic example to introduce a few key concepts. +In this section, we will cover a basic example to introduce a few key concepts. We will use the same +source file as described in the :ref:`Introduction `, the Pokemon data set. -.. code-block:: python +.. 
ipython:: python - import datafusion - from datafusion import col - import pyarrow + from datafusion import SessionContext, col, lit, functions as f - # create a context - ctx = datafusion.SessionContext() + ctx = SessionContext() - # create a RecordBatch and a new DataFrame from it - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], - names=["a", "b"], - ) - df = ctx.create_dataframe([[batch]]) + df = ctx.read_parquet("yellow_tripdata_2021-01.parquet") - # create a new statement df = df.select( - col("a") + col("b"), - col("a") - col("b"), + "trip_distance", + col("total_amount").alias("total"), + (f.round(lit(100.0) * col("tip_amount") / col("total_amount"), lit(1))).alias("tip_percent"), ) - # execute and collect the first (and only) batch - result = df.collect()[0] + df.show() -The first statement group: +Session Context +--------------- + +The first statement group creates a :py:class:`~datafusion.context.SessionContext`. .. code-block:: python # create a context ctx = datafusion.SessionContext() -creates a :py:class:`~datafusion.context.SessionContext`, that is, the main interface for executing queries with DataFusion. It maintains the state -of the connection between a user and an instance of the DataFusion engine. Additionally it provides the following functionality: +A Session Context is the main interface for executing queries with DataFusion. It maintains the state +of the connection between a user and an instance of the DataFusion engine. Additionally it provides +the following functionality: -- Create a DataFrame from a CSV or Parquet data source. -- Register a CSV or Parquet data source as a table that can be referenced from a SQL query. -- Register a custom data source that can be referenced from a SQL query. +- Create a DataFrame from a data source. +- Register a data source as a table that can be referenced from a SQL query. 
- Execute a SQL query +DataFrame +--------- + The second statement group creates a :code:`DataFrame`, .. code-block:: python - # create a RecordBatch and a new DataFrame from it - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], - names=["a", "b"], - ) - df = ctx.create_dataframe([[batch]]) + # Create a DataFrame from a file + df = ctx.read_parquet("yellow_tripdata_2021-01.parquet") A DataFrame refers to a (logical) set of rows that share the same column names, similar to a `Pandas DataFrame `_. DataFrames are typically created by calling a method on :py:class:`~datafusion.context.SessionContext`, such as :code:`read_csv`, and can then be modified by calling the transformation methods, such as :py:func:`~datafusion.dataframe.DataFrame.filter`, :py:func:`~datafusion.dataframe.DataFrame.select`, :py:func:`~datafusion.dataframe.DataFrame.aggregate`, and :py:func:`~datafusion.dataframe.DataFrame.limit` to build up a query definition. -The third statement uses :code:`Expressions` to build up a query definition. +Expressions +----------- + +The third statement uses :code:`Expressions` to build up a query definition. You can find +explanations for what the functions below do in the user documentation for +:py:func:`~datafusion.col`, :py:func:`~datafusion.lit`, :py:func:`~datafusion.functions.round`, +and :py:func:`~datafusion.expr.Expr.alias`. .. code-block:: python df = df.select( - col("a") + col("b"), - col("a") - col("b"), + "trip_distance", + col("total_amount").alias("total"), + (f.round(lit(100.0) * col("tip_amount") / col("total_amount"), lit(1))).alias("tip_percent"), ) -Finally the :py:func:`~datafusion.dataframe.DataFrame.collect` method converts the logical plan represented by the DataFrame into a physical plan and execute it, -collecting all results into a list of `RecordBatch `_. 
+Finally the :py:func:`~datafusion.dataframe.DataFrame.show` method converts the logical plan +represented by the DataFrame into a physical plan and executes it, collecting all results and +displaying them to the user. It is important to note that DataFusion performs lazy evaluation +of the DataFrame. Until you call a method such as :py:func:`~datafusion.dataframe.DataFrame.show` +or :py:func:`~datafusion.dataframe.DataFrame.collect`, DataFusion will not perform the query. diff --git a/docs/source/user-guide/common-operations/aggregations.rst b/docs/source/user-guide/common-operations/aggregations.rst index 8fee26a15..e458e5fcb 100644 --- a/docs/source/user-guide/common-operations/aggregations.rst +++ b/docs/source/user-guide/common-operations/aggregations.rst @@ -26,15 +26,7 @@ to form a single summary value. For performing an aggregation, DataFusion provid .. ipython:: python - import urllib.request - from datafusion import SessionContext - from datafusion import col, lit - from datafusion import functions as f - - urllib.request.urlretrieve( - "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv", - "pokemon.csv", - ) + from datafusion import SessionContext, col, lit, functions as f ctx = SessionContext() df = ctx.read_csv("pokemon.csv") diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst index a0b95c908..8d6a80855 100644 --- a/docs/source/user-guide/common-operations/functions.rst +++ b/docs/source/user-guide/common-operations/functions.rst @@ -25,14 +25,8 @@ We'll use the pokemon dataset in the following examples. .. 
ipython:: python - import urllib.request from datafusion import SessionContext - urllib.request.urlretrieve( - "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv", - "pokemon.csv", - ) - ctx = SessionContext() ctx.register_csv("pokemon", "pokemon.csv") df = ctx.table("pokemon") diff --git a/docs/source/user-guide/common-operations/index.rst b/docs/source/user-guide/common-operations/index.rst index b15b04c62..d7c708c21 100644 --- a/docs/source/user-guide/common-operations/index.rst +++ b/docs/source/user-guide/common-operations/index.rst @@ -18,6 +18,8 @@ Common Operations ================= +The contents of this section are designed to guide a new user through how to use DataFusion. + .. toctree:: :maxdepth: 2 diff --git a/docs/source/user-guide/common-operations/select-and-filter.rst b/docs/source/user-guide/common-operations/select-and-filter.rst index 075909129..083bcbbd2 100644 --- a/docs/source/user-guide/common-operations/select-and-filter.rst +++ b/docs/source/user-guide/common-operations/select-and-filter.rst @@ -21,18 +21,15 @@ Column Selections Use :py:func:`~datafusion.dataframe.DataFrame.select` for basic column selection. DataFusion can work with several file types, to start simple we can use a subset of the -`TLC Trip Record Data `_ +`TLC Trip Record Data `_, +which you can download `here `_. .. 
ipython:: python - - import urllib.request - from datafusion import SessionContext - urllib.request.urlretrieve("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet", - "yellow_trip_data.parquet") + from datafusion import SessionContext ctx = SessionContext() - df = ctx.read_parquet("yellow_trip_data.parquet") + df = ctx.read_parquet("yellow_tripdata_2021-01.parquet") df.select("trip_distance", "passenger_count") For mathematical or logical operations use :py:func:`~datafusion.col` to select columns, and give meaningful names to the resulting diff --git a/docs/source/user-guide/common-operations/windows.rst b/docs/source/user-guide/common-operations/windows.rst index 609176897..8225d125a 100644 --- a/docs/source/user-guide/common-operations/windows.rst +++ b/docs/source/user-guide/common-operations/windows.rst @@ -30,16 +30,10 @@ We'll use the pokemon dataset (from Ritchie Vink) in the following examples. .. ipython:: python - import urllib.request from datafusion import SessionContext from datafusion import col from datafusion import functions as f - urllib.request.urlretrieve( - "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv", - "pokemon.csv", - ) - ctx = SessionContext() df = ctx.read_csv("pokemon.csv") diff --git a/docs/source/user-guide/data-sources.rst b/docs/source/user-guide/data-sources.rst new file mode 100644 index 000000000..ba5967c97 --- /dev/null +++ b/docs/source/user-guide/data-sources.rst @@ -0,0 +1,187 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. 
http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _user_guide_data_sources: + +Data Sources +============ + +DataFusion provides a wide variety of ways to get data into a DataFrame to perform operations. + +Local file +---------- + +DataFusion has the ability to read from a variety of popular file formats, such as :ref:`Parquet <io_parquet>`, +:ref:`CSV <io_csv>`, :ref:`JSON <io_json>`, and :ref:`AVRO <io_avro>`. + +.. ipython:: python + + from datafusion import SessionContext + ctx = SessionContext() + df = ctx.read_csv("pokemon.csv") + df.show() + +Create in-memory +---------------- + +Sometimes it can be convenient to create a small DataFrame from a Python list or dictionary object. +To do this in DataFusion, you can use one of the three functions +:py:func:`~datafusion.context.SessionContext.from_pydict`, +:py:func:`~datafusion.context.SessionContext.from_pylist`, or +:py:func:`~datafusion.context.SessionContext.create_dataframe`. + +As their names suggest, ``from_pydict`` and ``from_pylist`` will create DataFrames from Python +dictionary and list objects, respectively. ``create_dataframe`` assumes you will pass in a list +of list of `PyArrow Record Batches `_. + +The following three examples all will create identical DataFrames: + +.. 
ipython:: python + + import pyarrow as pa + + ctx.from_pylist([ + { "a": 1, "b": 10.0, "c": "alpha" }, + { "a": 2, "b": 20.0, "c": "beta" }, + { "a": 3, "b": 30.0, "c": "gamma" }, + ]).show() + + ctx.from_pydict({ + "a": [1, 2, 3], + "b": [10.0, 20.0, 30.0], + "c": ["alpha", "beta", "gamma"], + }).show() + + batch = pa.RecordBatch.from_arrays( + [ + pa.array([1, 2, 3]), + pa.array([10.0, 20.0, 30.0]), + pa.array(["alpha", "beta", "gamma"]), + ], + names=["a", "b", "c"], + ) + + ctx.create_dataframe([[batch]]).show() + + +Object Store +------------ + +DataFusion has support for multiple storage options in addition to local files. +The example below requires an appropriate S3 account with access credentials. + +Supported Object Stores are + +- :py:class:`~datafusion.object_store.AmazonS3` +- :py:class:`~datafusion.object_store.GoogleCloud` +- :py:class:`~datafusion.object_store.Http` +- :py:class:`~datafusion.object_store.LocalFileSystem` +- :py:class:`~datafusion.object_store.MicrosoftAzure` + +.. code-block:: python + + from datafusion.object_store import AmazonS3 + + region = "us-east-1" + bucket_name = "yellow-trips" + + s3 = AmazonS3( + bucket_name=bucket_name, + region=region, + access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), + secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), + ) + + path = f"s3://{bucket_name}/" + ctx.register_object_store("s3://", s3, None) + + ctx.register_parquet("trips", path) + + ctx.table("trips").show() + +Other DataFrame Libraries +------------------------- + +DataFusion can import DataFrames directly from other libraries, such as +`Polars `_ and `Pandas `_. +Since DataFusion version 42.0.0, any DataFrame library that supports the Arrow FFI PyCapsule +interface can be imported to DataFusion using the +:py:func:`~datafusion.context.SessionContext.from_arrow` function. Older versions of Polars may +not support the arrow interface. 
In those cases, you can still import via the +:py:func:`~datafusion.context.SessionContext.from_polars` function. + +.. code-block:: python + + import pandas as pd + + data = { "a": [1, 2, 3], "b": [10.0, 20.0, 30.0], "c": ["alpha", "beta", "gamma"] } + pandas_df = pd.DataFrame(data) + + datafusion_df = ctx.from_arrow(pandas_df) + datafusion_df.show() + +.. code-block:: python + + import polars as pl + polars_df = pl.DataFrame(data) + + datafusion_df = ctx.from_arrow(polars_df) + datafusion_df.show() + +Delta Lake +---------- + +DataFusion 43.0.0 and later support the ability to register table providers from sources such +as Delta Lake. This will require a recent version of +`deltalake `_ to provide the required interfaces. + +.. code-block:: python + + from deltalake import DeltaTable + + delta_table = DeltaTable("path_to_table") + ctx.register_table_provider("my_delta_table", delta_table) + df = ctx.table("my_delta_table") + df.show() + +On older versions of ``deltalake`` (prior to 0.22) you can use the +`Arrow DataSet `_ +interface to import to DataFusion, but this does not support features such as filter push down +which can lead to a significant performance difference. + +.. code-block:: python + + from deltalake import DeltaTable + + delta_table = DeltaTable("path_to_table") + ctx.register_dataset("my_delta_table", delta_table.to_pyarrow_dataset()) + df = ctx.table("my_delta_table") + df.show() + +Iceberg +------- + +Coming soon! + +Custom Table Provider +--------------------- + +You can implement a custom Data Provider in Rust and expose it to DataFusion through the +interface as described in the :ref:`Custom Table Provider <io_custom_table_provider>` +section. This is an advanced topic, but a +`user example `_ +is provided in the DataFusion repository. 
diff --git a/docs/source/user-guide/introduction.rst b/docs/source/user-guide/introduction.rst index 8abb9113e..7b30ef2b2 100644 --- a/docs/source/user-guide/introduction.rst +++ b/docs/source/user-guide/introduction.rst @@ -39,5 +39,39 @@ You can verify the installation by running: import datafusion datafusion.__version__ +In this documentation we will also show some examples of how DataFusion integrates +with Jupyter notebooks. To install and start a JupyterLab session, use: +.. code-block:: shell + + pip install jupyterlab + jupyter lab + +To demonstrate working with DataFusion, we need a data source. Later in the tutorial we will show +options for data sources. For our first example, we demonstrate using a Pokemon dataset that you +can download +`here `_. + +With that file in place you can use the following Python example to view the DataFrame in +DataFusion. + +.. ipython:: python + + from datafusion import SessionContext + + ctx = SessionContext() + + df = ctx.read_csv("pokemon.csv") + + df.show() + +If you are working in a Jupyter notebook, you can also use the following to give you a table +display that may be easier to read. + +.. code-block:: python + + display(df) +.. image:: ../images/jupyter_lab_df_view.png + :width: 800 + :alt: Rendered table showing Pokemon DataFrame diff --git a/docs/source/user-guide/io/avro.rst b/docs/source/user-guide/io/avro.rst index 5f1ff728e..66398ac7f 100644 --- a/docs/source/user-guide/io/avro.rst +++ b/docs/source/user-guide/io/avro.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. _io_avro: + Avro ==== diff --git a/docs/source/user-guide/io/csv.rst b/docs/source/user-guide/io/csv.rst index d2a62bfec..144b6615c 100644 --- a/docs/source/user-guide/io/csv.rst +++ b/docs/source/user-guide/io/csv.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. 
_io_csv: + CSV === diff --git a/docs/source/user-guide/io/json.rst b/docs/source/user-guide/io/json.rst index f9da3755a..39030db7f 100644 --- a/docs/source/user-guide/io/json.rst +++ b/docs/source/user-guide/io/json.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. _io_json: + JSON ==== `JSON `_ (JavaScript Object Notation) is a lightweight data-interchange format. diff --git a/docs/source/user-guide/io/parquet.rst b/docs/source/user-guide/io/parquet.rst index 75bc981cc..c5b9ca3d4 100644 --- a/docs/source/user-guide/io/parquet.rst +++ b/docs/source/user-guide/io/parquet.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. _io_parquet: + Parquet ======= @@ -22,7 +24,6 @@ It is quite simple to read a parquet file using the :py:func:`~datafusion.contex .. code-block:: python - from datafusion import SessionContext ctx = SessionContext() diff --git a/docs/source/user-guide/io/table_provider.rst b/docs/source/user-guide/io/table_provider.rst index 2ff9ae46f..bd1d6b80f 100644 --- a/docs/source/user-guide/io/table_provider.rst +++ b/docs/source/user-guide/io/table_provider.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. _io_custom_table_provider: + Custom Table Provider ===================== diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index efd4038ae..e283f590e 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -473,7 +473,7 @@ def join_on( *on_exprs: Expr, how: Literal["inner", "left", "right", "full", "semi", "anti"] = "inner", ) -> DataFrame: - """Join two :py:class:`DataFrame`using the specified expressions. + """Join two :py:class:`DataFrame` using the specified expressions. On expressions are used to support in-equality predicates. 
Equality predicates are correctly optimized diff --git a/python/datafusion/plan.py b/python/datafusion/plan.py index 3836edec6..a71965f41 100644 --- a/python/datafusion/plan.py +++ b/python/datafusion/plan.py @@ -42,7 +42,7 @@ class LogicalPlan: (table) with a potentially different schema. Plans form a dataflow tree where data flows from leaves up to the root to produce the query result. - `LogicalPlan`s can be created by the SQL query planner, the DataFrame API, + A `LogicalPlan` can be created by the SQL query planner, the DataFrame API, or programmatically (for example custom query languages). """ @@ -107,7 +107,7 @@ def __init__(self, plan: df_internal.ExecutionPlan) -> None: self._raw_plan = plan def children(self) -> List[ExecutionPlan]: - """Get a list of children `ExecutionPlan`s that act as inputs to this plan. + """Get a list of children `ExecutionPlan` that act as inputs to this plan. The returned list will be empty for leaf nodes such as scans, will contain a single value for unary nodes, or two values for binary nodes (such as joins). 
From 2690e61b360a6224f1cd5b5bf29d8c082b87991d Mon Sep 17 00:00:00 2001 From: kosiew Date: Sat, 30 Nov 2024 23:25:04 +0800 Subject: [PATCH 079/248] Add datafusion.extract (#959) * feat: add extract function as an alias for date_part * docs: update user guide to include examples for date_part and extract functions * fix: update examples in user guide to use f.to_timestamp for date extraction --- docs/source/user-guide/common-operations/functions.rst | 9 +++++++++ python/datafusion/functions.py | 9 +++++++++ python/tests/test_functions.py | 2 ++ 3 files changed, 20 insertions(+) diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst index 8d6a80855..ad71c72ac 100644 --- a/docs/source/user-guide/common-operations/functions.rst +++ b/docs/source/user-guide/common-operations/functions.rst @@ -72,6 +72,15 @@ Convert to timestamps using :py:func:`~datafusion.functions.to_timestamp` df.select(f.to_timestamp(col('"Total"')).alias("timestamp")) +Extracting parts of a date using :py:func:`~datafusion.functions.date_part` (alias :py:func:`~datafusion.functions.extract`) + +.. ipython:: python + + df.select( + f.date_part(literal("month"), f.to_timestamp(col('"Total"'))).alias("month"), + f.extract(literal("day"), f.to_timestamp(col('"Total"'))).alias("day") + ) + String ------ diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 6ad4c50c2..15ad8822f 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -128,6 +128,7 @@ "empty", "encode", "ends_with", + "extract", "exp", "factorial", "find_in_set", @@ -994,6 +995,14 @@ def date_part(part: Expr, date: Expr) -> Expr: return Expr(f.date_part(part.expr, date.expr)) +def extract(part: Expr, date: Expr) -> Expr: + """Extracts a subfield from the date. + + This is an alias for :py:func:`date_part`. 
+ """ + return date_part(part, date) + + def date_trunc(part: Expr, date: Expr) -> Expr: """Truncates the date to a specified level of precision.""" return Expr(f.date_trunc(part.expr, date.expr)) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 0d40032bb..0d2fa8f94 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -866,6 +866,7 @@ def test_temporal_functions(df): f.to_timestamp_seconds(literal("2023-09-07 05:06:14.523952")), f.to_timestamp_millis(literal("2023-09-07 05:06:14.523952")), f.to_timestamp_micros(literal("2023-09-07 05:06:14.523952")), + f.extract(literal("day"), column("d")), ) result = df.collect() assert len(result) == 1 @@ -903,6 +904,7 @@ def test_temporal_functions(df): assert result.column(9) == pa.array( [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us") ) + assert result.column(10) == pa.array([31, 26, 2], type=pa.float64()) def test_case(df): From 5c834934dec89bd96ff70df3b278e9d6fe78f7ec Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 2 Dec 2024 08:05:52 -0700 Subject: [PATCH 080/248] chore: Prepare 43.0.0 Release (#960) * Generate changelog * cargo update --- CHANGELOG.md | 695 +---------------------------------- Cargo.lock | 410 +++++++++++---------- Cargo.toml | 2 +- dev/changelog/43.0.0.md | 73 ++++ dev/changelog/pre-43.0.0.md | 715 ++++++++++++++++++++++++++++++++++++ 5 files changed, 1014 insertions(+), 881 deletions(-) create mode 100644 dev/changelog/43.0.0.md create mode 100644 dev/changelog/pre-43.0.0.md diff --git a/CHANGELOG.md b/CHANGELOG.md index ae3a2348a..ae40911d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,697 +19,4 @@ # DataFusion Python Changelog -## [42.0.0](https://github.com/apache/datafusion-python/tree/42.0.0) (2024-10-06) - -This release consists of 20 commits from 6 contributors. See credits at the end of this changelog for more information. 
- -**Implemented enhancements:** - -- feat: expose between [#868](https://github.com/apache/datafusion-python/pull/868) (mesejo) -- feat: make register_csv accept a list of paths [#883](https://github.com/apache/datafusion-python/pull/883) (mesejo) -- feat: expose http object store [#885](https://github.com/apache/datafusion-python/pull/885) (mesejo) - -**Fixed bugs:** - -- fix: Calling `count` on a pyarrow dataset results in an error [#843](https://github.com/apache/datafusion-python/pull/843) (Michael-J-Ward) - -**Other:** - -- Upgrade datafusion [#867](https://github.com/apache/datafusion-python/pull/867) (emgeee) -- Feature/aggregates as windows [#871](https://github.com/apache/datafusion-python/pull/871) (timsaucer) -- Fix regression on register_udaf [#878](https://github.com/apache/datafusion-python/pull/878) (timsaucer) -- build(deps): upgrade setup-protoc action and protoc version number [#873](https://github.com/apache/datafusion-python/pull/873) (Michael-J-Ward) -- build(deps): bump prost-types from 0.13.2 to 0.13.3 [#881](https://github.com/apache/datafusion-python/pull/881) (dependabot[bot]) -- build(deps): bump prost from 0.13.2 to 0.13.3 [#882](https://github.com/apache/datafusion-python/pull/882) (dependabot[bot]) -- chore: remove XFAIL from passing tests [#884](https://github.com/apache/datafusion-python/pull/884) (Michael-J-Ward) -- Add user defined window function support [#880](https://github.com/apache/datafusion-python/pull/880) (timsaucer) -- build(deps): bump syn from 2.0.77 to 2.0.79 [#886](https://github.com/apache/datafusion-python/pull/886) (dependabot[bot]) -- fix example of reading parquet from s3 [#896](https://github.com/apache/datafusion-python/pull/896) (sir-sigurd) -- release-testing [#889](https://github.com/apache/datafusion-python/pull/889) (Michael-J-Ward) -- chore(bench): fix create_tables.sql for tpch benchmark [#897](https://github.com/apache/datafusion-python/pull/897) (Michael-J-Ward) -- Add physical and logical plan 
conversion to and from protobuf [#892](https://github.com/apache/datafusion-python/pull/892) (timsaucer) -- Feature/instance udfs [#890](https://github.com/apache/datafusion-python/pull/890) (timsaucer) -- chore(ci): remove Mambaforge variant from CI [#894](https://github.com/apache/datafusion-python/pull/894) (Michael-J-Ward) -- Use OnceLock to store TokioRuntime [#895](https://github.com/apache/datafusion-python/pull/895) (Michael-J-Ward) - -## Credits - -Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. - -``` - 7 Michael J Ward - 5 Tim Saucer - 3 Daniel Mesejo - 3 dependabot[bot] - 1 Matt Green - 1 Sergey Fedoseev -``` - -Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. - -## [41.0.0](https://github.com/apache/datafusion-python/tree/41.0.0) (2024-09-09) - -This release consists of 19 commits from 6 contributors. See credits at the end of this changelog for more information. 
- -**Implemented enhancements:** - -- feat: enable list of paths for read_csv [#824](https://github.com/apache/datafusion-python/pull/824) (mesejo) -- feat: better exception and message for table not found [#851](https://github.com/apache/datafusion-python/pull/851) (mesejo) -- feat: make cast accept built-in Python types [#858](https://github.com/apache/datafusion-python/pull/858) (mesejo) - -**Other:** - -- chore: Prepare for 40.0.0 release [#801](https://github.com/apache/datafusion-python/pull/801) (andygrove) -- Add typing-extensions dependency to pyproject [#805](https://github.com/apache/datafusion-python/pull/805) (timsaucer) -- Upgrade deps to datafusion 41 [#802](https://github.com/apache/datafusion-python/pull/802) (Michael-J-Ward) -- Fix SessionContext init with only SessionConfig [#827](https://github.com/apache/datafusion-python/pull/827) (jcrist) -- build(deps): upgrade actions/{upload,download}-artifact@v3 to v4 [#829](https://github.com/apache/datafusion-python/pull/829) (Michael-J-Ward) -- Run ruff format in CI [#837](https://github.com/apache/datafusion-python/pull/837) (timsaucer) -- Add PyCapsule support for Arrow import and export [#825](https://github.com/apache/datafusion-python/pull/825) (timsaucer) -- Feature/expose when function [#836](https://github.com/apache/datafusion-python/pull/836) (timsaucer) -- Add Window Functions for use with function builder [#808](https://github.com/apache/datafusion-python/pull/808) (timsaucer) -- chore: fix typos [#844](https://github.com/apache/datafusion-python/pull/844) (mesejo) -- build(ci): use proper mac runners [#841](https://github.com/apache/datafusion-python/pull/841) (Michael-J-Ward) -- Set of small features [#839](https://github.com/apache/datafusion-python/pull/839) (timsaucer) -- chore: fix docstrings, typos [#852](https://github.com/apache/datafusion-python/pull/852) (mesejo) -- chore: Use datafusion re-exported dependencies [#856](https://github.com/apache/datafusion-python/pull/856) 
(emgeee) -- add guidelines on separating python and rust code [#860](https://github.com/apache/datafusion-python/pull/860) (Michael-J-Ward) -- Update Aggregate functions to take builder parameters [#859](https://github.com/apache/datafusion-python/pull/859) (timsaucer) - -## Credits - -Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. - -``` - 7 Tim Saucer - 5 Daniel Mesejo - 4 Michael J Ward - 1 Andy Grove - 1 Jim Crist-Harif - 1 Matt Green -``` - -Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. - -## [40.0.0](https://github.com/apache/datafusion-python/tree/40.0.0) (2024-08-09) - -This release consists of 18 commits from 4 contributors. See credits at the end of this changelog for more information. - -- Update changelog for 39.0.0 [#742](https://github.com/apache/datafusion-python/pull/742) (andygrove) -- build(deps): bump uuid from 1.8.0 to 1.9.1 [#744](https://github.com/apache/datafusion-python/pull/744) (dependabot[bot]) -- build(deps): bump mimalloc from 0.1.42 to 0.1.43 [#745](https://github.com/apache/datafusion-python/pull/745) (dependabot[bot]) -- build(deps): bump syn from 2.0.67 to 2.0.68 [#746](https://github.com/apache/datafusion-python/pull/746) (dependabot[bot]) -- Tsaucer/find window fn [#747](https://github.com/apache/datafusion-python/pull/747) (timsaucer) -- Python wrapper classes for all user interfaces [#750](https://github.com/apache/datafusion-python/pull/750) (timsaucer) -- Expose array sort [#764](https://github.com/apache/datafusion-python/pull/764) (timsaucer) -- Upgrade protobuf and remove GH Action googletest-installer [#773](https://github.com/apache/datafusion-python/pull/773) (Michael-J-Ward) -- Upgrade Datafusion 40 [#771](https://github.com/apache/datafusion-python/pull/771) (Michael-J-Ward) -- Bugfix: Calling count with None arguments 
[#768](https://github.com/apache/datafusion-python/pull/768) (timsaucer) -- Add in user example that compares a two different approaches to UDFs [#770](https://github.com/apache/datafusion-python/pull/770) (timsaucer) -- Add missing exports for wrapper modules [#782](https://github.com/apache/datafusion-python/pull/782) (timsaucer) -- Add PyExpr to_variant conversions [#793](https://github.com/apache/datafusion-python/pull/793) (Michael-J-Ward) -- Add missing expressions to wrapper export [#795](https://github.com/apache/datafusion-python/pull/795) (timsaucer) -- Doc/cross reference [#791](https://github.com/apache/datafusion-python/pull/791) (timsaucer) -- Re-Enable `num_centroids` to `approx_percentile_cont` [#798](https://github.com/apache/datafusion-python/pull/798) (Michael-J-Ward) -- UDAF process all state variables [#799](https://github.com/apache/datafusion-python/pull/799) (timsaucer) - -## Credits - -Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. - -``` - 9 Tim Saucer - 4 Michael J Ward - 3 dependabot[bot] - 2 Andy Grove -``` - -Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. 
- -## [39.0.0](https://github.com/apache/datafusion-python/tree/39.0.0) (2024-06-25) - -**Merged pull requests:** - -- ci: add substrait feature to linux builds [#720](https://github.com/apache/datafusion-python/pull/720) (Michael-J-Ward) -- Docs deploy action [#721](https://github.com/apache/datafusion-python/pull/721) (Michael-J-Ward) -- update deps [#723](https://github.com/apache/datafusion-python/pull/723) (Michael-J-Ward) -- Upgrade maturin [#725](https://github.com/apache/datafusion-python/pull/725) (Michael-J-Ward) -- Upgrade datafusion 39 [#728](https://github.com/apache/datafusion-python/pull/728) (Michael-J-Ward) -- use ScalarValue::to_pyarrow to convert to python object [#731](https://github.com/apache/datafusion-python/pull/731) (Michael-J-Ward) -- Pyo3 `Bound<'py, T>` api [#734](https://github.com/apache/datafusion-python/pull/734) (Michael-J-Ward) -- github test action: drop python 3.7, add python 3.12 [#736](https://github.com/apache/datafusion-python/pull/736) (Michael-J-Ward) -- Pyarrow filter pushdowns [#735](https://github.com/apache/datafusion-python/pull/735) (Michael-J-Ward) -- build(deps): bump syn from 2.0.66 to 2.0.67 [#738](https://github.com/apache/datafusion-python/pull/738) (dependabot[bot]) -- Pyo3 refactorings [#740](https://github.com/apache/datafusion-python/pull/740) (Michael-J-Ward) -- UDAF `sum` workaround [#741](https://github.com/apache/datafusion-python/pull/741) (Michael-J-Ward) - -## [38.0.1](https://github.com/apache/datafusion-python/tree/38.0.1) (2024-05-25) - -**Implemented enhancements:** - -- feat: add python bindings for ends_with function [#693](https://github.com/apache/datafusion-python/pull/693) (richtia) -- feat: expose `named_struct` in python [#700](https://github.com/apache/datafusion-python/pull/700) (Michael-J-Ward) - -**Merged pull requests:** - -- Add document about basics of working with expressions [#668](https://github.com/apache/datafusion-python/pull/668) (timsaucer) -- chore: Update Python release 
process now that DataFusion is TLP [#674](https://github.com/apache/datafusion-python/pull/674) (andygrove) -- Fix Docs [#676](https://github.com/apache/datafusion-python/pull/676) (Michael-J-Ward) -- Add examples from TPC-H [#666](https://github.com/apache/datafusion-python/pull/666) (timsaucer) -- fix conda nightly builds, attempt 2 [#689](https://github.com/apache/datafusion-python/pull/689) (Michael-J-Ward) -- Upgrade to datafusion 38 [#691](https://github.com/apache/datafusion-python/pull/691) (Michael-J-Ward) -- chore: update to maturin's recommended project layout for rust/python… [#695](https://github.com/apache/datafusion-python/pull/695) (Michael-J-Ward) -- chore: update cargo deps [#698](https://github.com/apache/datafusion-python/pull/698) (Michael-J-Ward) -- feat: add python bindings for ends_with function [#693](https://github.com/apache/datafusion-python/pull/693) (richtia) -- feat: expose `named_struct` in python [#700](https://github.com/apache/datafusion-python/pull/700) (Michael-J-Ward) -- Website fixes [#702](https://github.com/apache/datafusion-python/pull/702) (Michael-J-Ward) - -## [37.1.0](https://github.com/apache/datafusion-python/tree/37.1.0) (2024-05-08) - -**Implemented enhancements:** - -- feat: add execute_stream and execute_stream_partitioned [#610](https://github.com/apache/datafusion-python/pull/610) (mesejo) - -**Documentation updates:** - -- docs: update docs CI to install python-311 requirements [#661](https://github.com/apache/datafusion-python/pull/661) (Michael-J-Ward) - -**Merged pull requests:** - -- Switch to Ruff for Python linting [#529](https://github.com/apache/datafusion-python/pull/529) (andygrove) -- Remove sql-on-pandas/polars/cudf examples [#602](https://github.com/apache/datafusion-python/pull/602) (andygrove) -- build(deps): bump object_store from 0.9.0 to 0.9.1 [#611](https://github.com/apache/datafusion-python/pull/611) (dependabot[bot]) -- More missing array funcs 
[#605](https://github.com/apache/datafusion-python/pull/605) (judahrand) -- feat: add execute_stream and execute_stream_partitioned [#610](https://github.com/apache/datafusion-python/pull/610) (mesejo) -- build(deps): bump uuid from 1.7.0 to 1.8.0 [#615](https://github.com/apache/datafusion-python/pull/615) (dependabot[bot]) -- Bind SQLOptions and relative ctx method #567 [#588](https://github.com/apache/datafusion-python/pull/588) (giacomorebecchi) -- bugfix: no panic on empty table [#613](https://github.com/apache/datafusion-python/pull/613) (mesejo) -- Expose `register_listing_table` [#618](https://github.com/apache/datafusion-python/pull/618) (henrifroese) -- Expose unnest feature [#641](https://github.com/apache/datafusion-python/pull/641) (timsaucer) -- Update domain names and paths in asf yaml [#643](https://github.com/apache/datafusion-python/pull/643) (andygrove) -- use python 3.11 to publish docs [#645](https://github.com/apache/datafusion-python/pull/645) (andygrove) -- docs: update docs CI to install python-311 requirements [#661](https://github.com/apache/datafusion-python/pull/661) (Michael-J-Ward) -- Upgrade Datafusion to v37.1.0 [#669](https://github.com/apache/datafusion-python/pull/669) (Michael-J-Ward) - -## [36.0.0](https://github.com/apache/datafusion-python/tree/36.0.0) (2024-03-02) - -**Implemented enhancements:** - -- feat: Add `flatten` array function [#562](https://github.com/apache/datafusion-python/pull/562) (mobley-trent) - -**Documentation updates:** - -- docs: Add ASF attribution [#580](https://github.com/apache/datafusion-python/pull/580) (simicd) - -**Merged pull requests:** - -- Allow PyDataFrame to be used from other projects [#582](https://github.com/apache/datafusion-python/pull/582) (andygrove) -- docs: Add ASF attribution [#580](https://github.com/apache/datafusion-python/pull/580) (simicd) -- Add array functions [#560](https://github.com/apache/datafusion-python/pull/560) (ongchi) -- feat: Add `flatten` array function 
[#562](https://github.com/apache/datafusion-python/pull/562) (mobley-trent) - -## [35.0.0](https://github.com/apache/datafusion-python/tree/35.0.0) (2024-01-20) - -**Merged pull requests:** - -- build(deps): bump syn from 2.0.41 to 2.0.43 [#559](https://github.com/apache/datafusion-python/pull/559) (dependabot[bot]) -- build(deps): bump tokio from 1.35.0 to 1.35.1 [#558](https://github.com/apache/datafusion-python/pull/558) (dependabot[bot]) -- build(deps): bump async-trait from 0.1.74 to 0.1.77 [#556](https://github.com/apache/datafusion-python/pull/556) (dependabot[bot]) -- build(deps): bump pyo3 from 0.20.0 to 0.20.2 [#557](https://github.com/apache/datafusion-python/pull/557) (dependabot[bot]) - -## [34.0.0](https://github.com/apache/datafusion-python/tree/34.0.0) (2023-12-28) - -**Merged pull requests:** - -- Adjust visibility of crate private members & Functions [#537](https://github.com/apache/datafusion-python/pull/537) (jdye64) -- Update json.rst [#538](https://github.com/apache/datafusion-python/pull/538) (ray-andrew) -- Enable mimalloc local_dynamic_tls feature [#540](https://github.com/apache/datafusion-python/pull/540) (jdye64) -- Enable substrait feature to be built by default in CI, for nightlies … [#544](https://github.com/apache/datafusion-python/pull/544) (jdye64) - -## [33.0.0](https://github.com/apache/datafusion-python/tree/33.0.0) (2023-11-16) - -**Merged pull requests:** - -- First pass at getting architectured builds working [#350](https://github.com/apache/datafusion-python/pull/350) (charlesbluca) -- Remove libprotobuf dep [#527](https://github.com/apache/datafusion-python/pull/527) (jdye64) - -## [32.0.0](https://github.com/apache/datafusion-python/tree/32.0.0) (2023-10-21) - -**Implemented enhancements:** - -- feat: expose PyWindowFrame [#509](https://github.com/apache/datafusion-python/pull/509) (dlovell) -- add Binary String Functions;encode,decode [#494](https://github.com/apache/datafusion-python/pull/494) (jiangzhx) -- add 
bit_and,bit_or,bit_xor,bool_add,bool_or [#496](https://github.com/apache/datafusion-python/pull/496) (jiangzhx) -- add first_value last_value [#498](https://github.com/apache/datafusion-python/pull/498) (jiangzhx) -- add regr\_\* functions [#499](https://github.com/apache/datafusion-python/pull/499) (jiangzhx) -- Add random missing bindings [#522](https://github.com/apache/datafusion-python/pull/522) (jdye64) -- Allow for multiple input files per table instead of a single file [#519](https://github.com/apache/datafusion-python/pull/519) (jdye64) -- Add support for window function bindings [#521](https://github.com/apache/datafusion-python/pull/521) (jdye64) - -**Merged pull requests:** - -- Prepare 31.0.0 release [#500](https://github.com/apache/datafusion-python/pull/500) (andygrove) -- Improve release process documentation [#505](https://github.com/apache/datafusion-python/pull/505) (andygrove) -- add Binary String Functions;encode,decode [#494](https://github.com/apache/datafusion-python/pull/494) (jiangzhx) -- build(deps): bump mimalloc from 0.1.38 to 0.1.39 [#502](https://github.com/apache/datafusion-python/pull/502) (dependabot[bot]) -- build(deps): bump syn from 2.0.32 to 2.0.35 [#503](https://github.com/apache/datafusion-python/pull/503) (dependabot[bot]) -- build(deps): bump syn from 2.0.35 to 2.0.37 [#506](https://github.com/apache/datafusion-python/pull/506) (dependabot[bot]) -- Use latest DataFusion [#511](https://github.com/apache/datafusion-python/pull/511) (andygrove) -- add bit_and,bit_or,bit_xor,bool_add,bool_or [#496](https://github.com/apache/datafusion-python/pull/496) (jiangzhx) -- use DataFusion 32 [#515](https://github.com/apache/datafusion-python/pull/515) (andygrove) -- add first_value last_value [#498](https://github.com/apache/datafusion-python/pull/498) (jiangzhx) -- build(deps): bump regex-syntax from 0.7.5 to 0.8.1 [#517](https://github.com/apache/datafusion-python/pull/517) (dependabot[bot]) -- build(deps): bump pyo3-build-config from 
0.19.2 to 0.20.0 [#516](https://github.com/apache/datafusion-python/pull/516) (dependabot[bot]) -- add regr\_\* functions [#499](https://github.com/apache/datafusion-python/pull/499) (jiangzhx) -- Add random missing bindings [#522](https://github.com/apache/datafusion-python/pull/522) (jdye64) -- build(deps): bump rustix from 0.38.18 to 0.38.19 [#523](https://github.com/apache/datafusion-python/pull/523) (dependabot[bot]) -- Allow for multiple input files per table instead of a single file [#519](https://github.com/apache/datafusion-python/pull/519) (jdye64) -- Add support for window function bindings [#521](https://github.com/apache/datafusion-python/pull/521) (jdye64) -- Small clippy fix [#524](https://github.com/apache/datafusion-python/pull/524) (andygrove) - -## [31.0.0](https://github.com/apache/datafusion-python/tree/31.0.0) (2023-09-12) - -[Full Changelog](https://github.com/apache/datafusion-python/compare/28.0.0...31.0.0) - -**Implemented enhancements:** - -- feat: add case function (#447) [#448](https://github.com/apache/datafusion-python/pull/448) (mesejo) -- feat: add compression options [#456](https://github.com/apache/datafusion-python/pull/456) (mesejo) -- feat: add register_json [#458](https://github.com/apache/datafusion-python/pull/458) (mesejo) -- feat: add basic compression configuration to write_parquet [#459](https://github.com/apache/datafusion-python/pull/459) (mesejo) -- feat: add example of reading parquet from s3 [#460](https://github.com/apache/datafusion-python/pull/460) (mesejo) -- feat: add register_avro and read_table [#461](https://github.com/apache/datafusion-python/pull/461) (mesejo) -- feat: add missing scalar math functions [#465](https://github.com/apache/datafusion-python/pull/465) (mesejo) - -**Documentation updates:** - -- docs: include pre-commit hooks section in contributor guide [#455](https://github.com/apache/datafusion-python/pull/455) (mesejo) - -**Merged pull requests:** - -- Build Linux aarch64 wheel 
[#443](https://github.com/apache/datafusion-python/pull/443) (gokselk) -- feat: add case function (#447) [#448](https://github.com/apache/datafusion-python/pull/448) (mesejo) -- enhancement(docs): Add user guide (#432) [#445](https://github.com/apache/datafusion-python/pull/445) (mesejo) -- docs: include pre-commit hooks section in contributor guide [#455](https://github.com/apache/datafusion-python/pull/455) (mesejo) -- feat: add compression options [#456](https://github.com/apache/datafusion-python/pull/456) (mesejo) -- Upgrade to DF 28.0.0-rc1 [#457](https://github.com/apache/datafusion-python/pull/457) (andygrove) -- feat: add register_json [#458](https://github.com/apache/datafusion-python/pull/458) (mesejo) -- feat: add basic compression configuration to write_parquet [#459](https://github.com/apache/datafusion-python/pull/459) (mesejo) -- feat: add example of reading parquet from s3 [#460](https://github.com/apache/datafusion-python/pull/460) (mesejo) -- feat: add register_avro and read_table [#461](https://github.com/apache/datafusion-python/pull/461) (mesejo) -- feat: add missing scalar math functions [#465](https://github.com/apache/datafusion-python/pull/465) (mesejo) -- build(deps): bump arduino/setup-protoc from 1 to 2 [#452](https://github.com/apache/datafusion-python/pull/452) (dependabot[bot]) -- Revert "build(deps): bump arduino/setup-protoc from 1 to 2 (#452)" [#474](https://github.com/apache/datafusion-python/pull/474) (viirya) -- Minor: fix wrongly copied function description [#497](https://github.com/apache/datafusion-python/pull/497) (viirya) -- Upgrade to Datafusion 31.0.0 [#491](https://github.com/apache/datafusion-python/pull/491) (judahrand) -- Add `isnan` and `iszero` [#495](https://github.com/apache/datafusion-python/pull/495) (judahrand) - -## 30.0.0 - -- Skipped due to a breaking change in DataFusion - -## 29.0.0 - -- Skipped - -## [28.0.0](https://github.com/apache/datafusion-python/tree/28.0.0) (2023-07-25) - -**Implemented 
enhancements:** - -- feat: expose offset in python API [#437](https://github.com/apache/datafusion-python/pull/437) (cpcloud) - -**Merged pull requests:** - -- File based input utils [#433](https://github.com/apache/datafusion-python/pull/433) (jdye64) -- Upgrade to 28.0.0-rc1 [#434](https://github.com/apache/datafusion-python/pull/434) (andygrove) -- Introduces utility for obtaining SqlTable information from a file like location [#398](https://github.com/apache/datafusion-python/pull/398) (jdye64) -- feat: expose offset in python API [#437](https://github.com/apache/datafusion-python/pull/437) (cpcloud) -- Use DataFusion 28 [#439](https://github.com/apache/datafusion-python/pull/439) (andygrove) - -## [27.0.0](https://github.com/apache/datafusion-python/tree/27.0.0) (2023-07-03) - -**Merged pull requests:** - -- LogicalPlan.to_variant() make public [#412](https://github.com/apache/datafusion-python/pull/412) (jdye64) -- Prepare 27.0.0 release [#423](https://github.com/apache/datafusion-python/pull/423) (andygrove) - -## [26.0.0](https://github.com/apache/datafusion-python/tree/26.0.0) (2023-06-11) - -[Full Changelog](https://github.com/apache/datafusion-python/compare/25.0.0...26.0.0) - -**Merged pull requests:** - -- Add Expr::Case when_then_else support to rex_call_operands function [#388](https://github.com/apache/datafusion-python/pull/388) (jdye64) -- Introduce BaseSessionContext abstract class [#390](https://github.com/apache/datafusion-python/pull/390) (jdye64) -- CRUD Schema support for `BaseSessionContext` [#392](https://github.com/apache/datafusion-python/pull/392) (jdye64) -- CRUD Table support for `BaseSessionContext` [#394](https://github.com/apache/datafusion-python/pull/394) (jdye64) - -## [25.0.0](https://github.com/apache/datafusion-python/tree/25.0.0) (2023-05-23) - -[Full Changelog](https://github.com/apache/datafusion-python/compare/24.0.0...25.0.0) - -**Merged pull requests:** - -- Prepare 24.0.0 Release 
[#376](https://github.com/apache/datafusion-python/pull/376) (andygrove) -- build(deps): bump uuid from 1.3.1 to 1.3.2 [#359](https://github.com/apache/datafusion-python/pull/359) (dependabot[bot]) -- build(deps): bump mimalloc from 0.1.36 to 0.1.37 [#361](https://github.com/apache/datafusion-python/pull/361) (dependabot[bot]) -- build(deps): bump regex-syntax from 0.6.29 to 0.7.1 [#334](https://github.com/apache/datafusion-python/pull/334) (dependabot[bot]) -- upgrade maturin to 0.15.1 [#379](https://github.com/apache/datafusion-python/pull/379) (Jimexist) -- Expand Expr to include RexType basic support [#378](https://github.com/apache/datafusion-python/pull/378) (jdye64) -- Add Python script for generating changelog [#383](https://github.com/apache/datafusion-python/pull/383) (andygrove) - -## [24.0.0](https://github.com/apache/datafusion-python/tree/24.0.0) (2023-05-09) - -[Full Changelog](https://github.com/apache/datafusion-python/compare/23.0.0...24.0.0) - -**Documentation updates:** - -- Fix link to user guide [#354](https://github.com/apache/datafusion-python/pull/354) (andygrove) - -**Merged pull requests:** - -- Add interface to serialize Substrait plans to Python Bytes. [#344](https://github.com/apache/datafusion-python/pull/344) (kylebrooks-8451) -- Add partition_count property to ExecutionPlan. [#346](https://github.com/apache/datafusion-python/pull/346) (kylebrooks-8451) -- Remove unsendable from all Rust pyclass types. [#348](https://github.com/apache/datafusion-python/pull/348) (kylebrooks-8451) -- Fix link to user guide [#354](https://github.com/apache/datafusion-python/pull/354) (andygrove) -- Fix SessionContext execute. 
[#353](https://github.com/apache/datafusion-python/pull/353) (kylebrooks-8451) -- Pub mod expr in lib.rs [#357](https://github.com/apache/datafusion-python/pull/357) (jdye64) -- Add benchmark derived from TPC-H [#355](https://github.com/apache/datafusion-python/pull/355) (andygrove) -- Add db-benchmark [#365](https://github.com/apache/datafusion-python/pull/365) (andygrove) -- First pass of documentation in mdBook [#364](https://github.com/apache/datafusion-python/pull/364) (MrPowers) -- Add 'pub' and '#[pyo3(get, set)]' to DataTypeMap [#371](https://github.com/apache/datafusion-python/pull/371) (jdye64) -- Fix db-benchmark [#369](https://github.com/apache/datafusion-python/pull/369) (andygrove) -- Docs explaining how to view query plans [#373](https://github.com/apache/datafusion-python/pull/373) (andygrove) -- Improve db-benchmark [#372](https://github.com/apache/datafusion-python/pull/372) (andygrove) -- Make expr member of PyExpr public [#375](https://github.com/apache/datafusion-python/pull/375) (jdye64) - -## [23.0.0](https://github.com/apache/datafusion-python/tree/23.0.0) (2023-04-23) - -[Full Changelog](https://github.com/apache/datafusion-python/compare/22.0.0...23.0.0) - -**Merged pull requests:** - -- Improve API docs, README, and examples for configuring context [#321](https://github.com/apache/datafusion-python/pull/321) (andygrove) -- Osx build linker args [#330](https://github.com/apache/datafusion-python/pull/330) (jdye64) -- Add requirements file for python 3.11 [#332](https://github.com/apache/datafusion-python/pull/332) (r4ntix) -- mac arm64 build [#338](https://github.com/apache/datafusion-python/pull/338) (andygrove) -- Add conda.yaml baseline workflow file [#281](https://github.com/apache/datafusion-python/pull/281) (jdye64) -- Prepare for 23.0.0 release [#335](https://github.com/apache/datafusion-python/pull/335) (andygrove) -- Reuse the Tokio Runtime [#341](https://github.com/apache/datafusion-python/pull/341) (kylebrooks-8451) - -## 
[22.0.0](https://github.com/apache/datafusion-python/tree/22.0.0) (2023-04-10) - -[Full Changelog](https://github.com/apache/datafusion-python/compare/21.0.0...22.0.0) - -**Merged pull requests:** - -- Fix invalid build yaml [#308](https://github.com/apache/datafusion-python/pull/308) (andygrove) -- Try fix release build [#309](https://github.com/apache/datafusion-python/pull/309) (andygrove) -- Fix release build [#310](https://github.com/apache/datafusion-python/pull/310) (andygrove) -- Enable datafusion-substrait protoc feature, to remove compile-time dependency on protoc [#312](https://github.com/apache/datafusion-python/pull/312) (andygrove) -- Fix Mac/Win release builds in CI [#313](https://github.com/apache/datafusion-python/pull/313) (andygrove) -- install protoc in docs workflow [#314](https://github.com/apache/datafusion-python/pull/314) (andygrove) -- Fix documentation generation in CI [#315](https://github.com/apache/datafusion-python/pull/315) (andygrove) -- Source wheel fix [#319](https://github.com/apache/datafusion-python/pull/319) (andygrove) - -## [21.0.0](https://github.com/apache/datafusion-python/tree/21.0.0) (2023-03-30) - -[Full Changelog](https://github.com/apache/datafusion-python/compare/20.0.0...21.0.0) - -**Merged pull requests:** - -- minor: Fix minor warning on unused import [#289](https://github.com/apache/datafusion-python/pull/289) (viirya) -- feature: Implement `describe()` method [#293](https://github.com/apache/datafusion-python/pull/293) (simicd) -- fix: Printed results not visible in debugger & notebooks [#296](https://github.com/apache/datafusion-python/pull/296) (simicd) -- add package.include and remove wildcard dependency [#295](https://github.com/apache/datafusion-python/pull/295) (andygrove) -- Update main branch name in docs workflow [#303](https://github.com/apache/datafusion-python/pull/303) (andygrove) -- Upgrade to DF 21 [#301](https://github.com/apache/datafusion-python/pull/301) (andygrove) - -## 
[20.0.0](https://github.com/apache/datafusion-python/tree/20.0.0) (2023-03-17) - -[Full Changelog](https://github.com/apache/datafusion-python/compare/0.8.0...20.0.0) - -**Implemented enhancements:** - -- Empty relation bindings [#208](https://github.com/apache/datafusion-python/pull/208) (jdye64) -- wrap display_name and canonical_name functions [#214](https://github.com/apache/datafusion-python/pull/214) (jdye64) -- Add PyAlias bindings [#216](https://github.com/apache/datafusion-python/pull/216) (jdye64) -- Add bindings for scalar_variable [#218](https://github.com/apache/datafusion-python/pull/218) (jdye64) -- Bindings for LIKE type expressions [#220](https://github.com/apache/datafusion-python/pull/220) (jdye64) -- Bool expr bindings [#223](https://github.com/apache/datafusion-python/pull/223) (jdye64) -- Between bindings [#229](https://github.com/apache/datafusion-python/pull/229) (jdye64) -- Add bindings for GetIndexedField [#227](https://github.com/apache/datafusion-python/pull/227) (jdye64) -- Add bindings for case, cast, and trycast [#232](https://github.com/apache/datafusion-python/pull/232) (jdye64) -- add remaining expr bindings [#233](https://github.com/apache/datafusion-python/pull/233) (jdye64) -- feature: Additional export methods [#236](https://github.com/apache/datafusion-python/pull/236) (simicd) -- Add Python wrapper for LogicalPlan::Union [#240](https://github.com/apache/datafusion-python/pull/240) (iajoiner) -- feature: Create dataframe from pandas, polars, dictionary, list or pyarrow Table [#242](https://github.com/apache/datafusion-python/pull/242) (simicd) -- Add Python wrappers for `LogicalPlan::Join` and `LogicalPlan::CrossJoin` [#246](https://github.com/apache/datafusion-python/pull/246) (iajoiner) -- feature: Set table name from ctx functions [#260](https://github.com/apache/datafusion-python/pull/260) (simicd) -- Explain bindings [#264](https://github.com/apache/datafusion-python/pull/264) (jdye64) -- Extension bindings 
[#266](https://github.com/apache/datafusion-python/pull/266) (jdye64) -- Subquery alias bindings [#269](https://github.com/apache/datafusion-python/pull/269) (jdye64) -- Create memory table [#271](https://github.com/apache/datafusion-python/pull/271) (jdye64) -- Create view bindings [#273](https://github.com/apache/datafusion-python/pull/273) (jdye64) -- Re-export Datafusion dependencies [#277](https://github.com/apache/datafusion-python/pull/277) (jdye64) -- Distinct bindings [#275](https://github.com/apache/datafusion-python/pull/275) (jdye64) -- Drop table bindings [#283](https://github.com/apache/datafusion-python/pull/283) (jdye64) -- Bindings for LogicalPlan::Repartition [#285](https://github.com/apache/datafusion-python/pull/285) (jdye64) -- Expand Rust return type support for Arrow DataTypes in ScalarValue [#287](https://github.com/apache/datafusion-python/pull/287) (jdye64) - -**Documentation updates:** - -- docs: Example of calling Python UDF & UDAF in SQL [#258](https://github.com/apache/datafusion-python/pull/258) (simicd) - -**Merged pull requests:** - -- Minor docs updates [#210](https://github.com/apache/datafusion-python/pull/210) (andygrove) -- Empty relation bindings [#208](https://github.com/apache/datafusion-python/pull/208) (jdye64) -- wrap display_name and canonical_name functions [#214](https://github.com/apache/datafusion-python/pull/214) (jdye64) -- Add PyAlias bindings [#216](https://github.com/apache/datafusion-python/pull/216) (jdye64) -- Add bindings for scalar_variable [#218](https://github.com/apache/datafusion-python/pull/218) (jdye64) -- Bindings for LIKE type expressions [#220](https://github.com/apache/datafusion-python/pull/220) (jdye64) -- Bool expr bindings [#223](https://github.com/apache/datafusion-python/pull/223) (jdye64) -- Between bindings [#229](https://github.com/apache/datafusion-python/pull/229) (jdye64) -- Add bindings for GetIndexedField [#227](https://github.com/apache/datafusion-python/pull/227) (jdye64) -- Add 
bindings for case, cast, and trycast [#232](https://github.com/apache/datafusion-python/pull/232) (jdye64) -- add remaining expr bindings [#233](https://github.com/apache/datafusion-python/pull/233) (jdye64) -- Pre-commit hooks [#228](https://github.com/apache/datafusion-python/pull/228) (jdye64) -- Implement new release process [#149](https://github.com/apache/datafusion-python/pull/149) (andygrove) -- feature: Additional export methods [#236](https://github.com/apache/datafusion-python/pull/236) (simicd) -- Add Python wrapper for LogicalPlan::Union [#240](https://github.com/apache/datafusion-python/pull/240) (iajoiner) -- feature: Create dataframe from pandas, polars, dictionary, list or pyarrow Table [#242](https://github.com/apache/datafusion-python/pull/242) (simicd) -- Fix release instructions [#238](https://github.com/apache/datafusion-python/pull/238) (andygrove) -- Add Python wrappers for `LogicalPlan::Join` and `LogicalPlan::CrossJoin` [#246](https://github.com/apache/datafusion-python/pull/246) (iajoiner) -- docs: Example of calling Python UDF & UDAF in SQL [#258](https://github.com/apache/datafusion-python/pull/258) (simicd) -- feature: Set table name from ctx functions [#260](https://github.com/apache/datafusion-python/pull/260) (simicd) -- Upgrade to DataFusion 19 [#262](https://github.com/apache/datafusion-python/pull/262) (andygrove) -- Explain bindings [#264](https://github.com/apache/datafusion-python/pull/264) (jdye64) -- Extension bindings [#266](https://github.com/apache/datafusion-python/pull/266) (jdye64) -- Subquery alias bindings [#269](https://github.com/apache/datafusion-python/pull/269) (jdye64) -- Create memory table [#271](https://github.com/apache/datafusion-python/pull/271) (jdye64) -- Create view bindings [#273](https://github.com/apache/datafusion-python/pull/273) (jdye64) -- Re-export Datafusion dependencies [#277](https://github.com/apache/datafusion-python/pull/277) (jdye64) -- Distinct bindings 
[#275](https://github.com/apache/datafusion-python/pull/275) (jdye64) -- build(deps): bump actions/checkout from 2 to 3 [#244](https://github.com/apache/datafusion-python/pull/244) (dependabot[bot]) -- build(deps): bump actions/upload-artifact from 2 to 3 [#245](https://github.com/apache/datafusion-python/pull/245) (dependabot[bot]) -- build(deps): bump actions/download-artifact from 2 to 3 [#243](https://github.com/apache/datafusion-python/pull/243) (dependabot[bot]) -- Use DataFusion 20 [#278](https://github.com/apache/datafusion-python/pull/278) (andygrove) -- Drop table bindings [#283](https://github.com/apache/datafusion-python/pull/283) (jdye64) -- Bindings for LogicalPlan::Repartition [#285](https://github.com/apache/datafusion-python/pull/285) (jdye64) -- Expand Rust return type support for Arrow DataTypes in ScalarValue [#287](https://github.com/apache/datafusion-python/pull/287) (jdye64) - -## [0.8.0](https://github.com/apache/datafusion-python/tree/0.8.0) (2023-02-22) - -[Full Changelog](https://github.com/apache/datafusion-python/compare/0.8.0-rc1...0.8.0) - -**Implemented enhancements:** - -- Add support for cuDF physical execution engine [\#202](https://github.com/apache/datafusion-python/issues/202) -- Make it easier to create a Pandas dataframe from DataFusion query results [\#139](https://github.com/apache/datafusion-python/issues/139) - -**Fixed bugs:** - -- Build error: could not compile `thiserror` due to 2 previous errors [\#69](https://github.com/apache/datafusion-python/issues/69) - -**Closed issues:** - -- Integrate with the new `object_store` crate [\#22](https://github.com/apache/datafusion-python/issues/22) - -**Merged pull requests:** - -- Update README in preparation for 0.8 release [\#206](https://github.com/apache/datafusion-python/pull/206) ([andygrove](https://github.com/andygrove)) -- Add support for cudf as a physical execution engine [\#205](https://github.com/apache/datafusion-python/pull/205) 
([jdye64](https://github.com/jdye64)) -- Run `maturin develop` instead of `cargo build` in verification script [\#200](https://github.com/apache/datafusion-python/pull/200) ([andygrove](https://github.com/andygrove)) -- Add tests for recently added functionality [\#199](https://github.com/apache/datafusion-python/pull/199) ([andygrove](https://github.com/andygrove)) -- Implement `to_pandas()` [\#197](https://github.com/apache/datafusion-python/pull/197) ([simicd](https://github.com/simicd)) -- Add Python wrapper for LogicalPlan::Sort [\#196](https://github.com/apache/datafusion-python/pull/196) ([andygrove](https://github.com/andygrove)) -- Add Python wrapper for LogicalPlan::Aggregate [\#195](https://github.com/apache/datafusion-python/pull/195) ([andygrove](https://github.com/andygrove)) -- Add Python wrapper for LogicalPlan::Limit [\#193](https://github.com/apache/datafusion-python/pull/193) ([andygrove](https://github.com/andygrove)) -- Add Python wrapper for LogicalPlan::Filter [\#192](https://github.com/apache/datafusion-python/pull/192) ([andygrove](https://github.com/andygrove)) -- Add experimental support for executing SQL with Polars and Pandas [\#190](https://github.com/apache/datafusion-python/pull/190) ([andygrove](https://github.com/andygrove)) -- Update changelog for 0.8 release [\#188](https://github.com/apache/datafusion-python/pull/188) ([andygrove](https://github.com/andygrove)) -- Add ability to execute ExecutionPlan and get a stream of RecordBatch [\#186](https://github.com/apache/datafusion-python/pull/186) ([andygrove](https://github.com/andygrove)) -- Dffield bindings [\#185](https://github.com/apache/datafusion-python/pull/185) ([jdye64](https://github.com/jdye64)) -- Add bindings for DFSchema [\#183](https://github.com/apache/datafusion-python/pull/183) ([jdye64](https://github.com/jdye64)) -- test: Window functions [\#182](https://github.com/apache/datafusion-python/pull/182) ([simicd](https://github.com/simicd)) -- Add bindings for 
Projection [\#180](https://github.com/apache/datafusion-python/pull/180) ([jdye64](https://github.com/jdye64)) -- Table scan bindings [\#178](https://github.com/apache/datafusion-python/pull/178) ([jdye64](https://github.com/jdye64)) -- Make session configurable [\#176](https://github.com/apache/datafusion-python/pull/176) ([andygrove](https://github.com/andygrove)) -- Upgrade to DataFusion 18.0.0 [\#175](https://github.com/apache/datafusion-python/pull/175) ([andygrove](https://github.com/andygrove)) -- Use latest DataFusion rev in preparation for DF 18 release [\#174](https://github.com/apache/datafusion-python/pull/174) ([andygrove](https://github.com/andygrove)) -- Arrow type bindings [\#173](https://github.com/apache/datafusion-python/pull/173) ([jdye64](https://github.com/jdye64)) -- Pyo3 bump [\#171](https://github.com/apache/datafusion-python/pull/171) ([jdye64](https://github.com/jdye64)) -- feature: Add additional aggregation functions [\#170](https://github.com/apache/datafusion-python/pull/170) ([simicd](https://github.com/simicd)) -- Make from_substrait_plan return DataFrame instead of LogicalPlan [\#164](https://github.com/apache/datafusion-python/pull/164) ([andygrove](https://github.com/andygrove)) -- feature: Implement count method [\#163](https://github.com/apache/datafusion-python/pull/163) ([simicd](https://github.com/simicd)) -- CI Fixes [\#162](https://github.com/apache/datafusion-python/pull/162) ([jdye64](https://github.com/jdye64)) -- Upgrade to DataFusion 17 [\#160](https://github.com/apache/datafusion-python/pull/160) ([andygrove](https://github.com/andygrove)) -- feature: Improve string representation of datafusion classes [\#159](https://github.com/apache/datafusion-python/pull/159) ([simicd](https://github.com/simicd)) -- Make PyExecutionPlan.plan public [\#156](https://github.com/apache/datafusion-python/pull/156) ([andygrove](https://github.com/andygrove)) -- Expose methods on logical and execution plans 
[\#155](https://github.com/apache/datafusion-python/pull/155) ([andygrove](https://github.com/andygrove)) -- Fix clippy for new Rust version [\#154](https://github.com/apache/datafusion-python/pull/154) ([andygrove](https://github.com/andygrove)) -- Add DataFrame methods for accessing plans [\#153](https://github.com/apache/datafusion-python/pull/153) ([andygrove](https://github.com/andygrove)) -- Use DataFusion rev 5238e8c97f998b4d2cb9fab85fb182f325a1a7fb [\#150](https://github.com/apache/datafusion-python/pull/150) ([andygrove](https://github.com/andygrove)) -- build\(deps\): bump async-trait from 0.1.61 to 0.1.62 [\#148](https://github.com/apache/datafusion-python/pull/148) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Rename default branch from master to main [\#147](https://github.com/apache/datafusion-python/pull/147) ([andygrove](https://github.com/andygrove)) -- Substrait bindings [\#145](https://github.com/apache/datafusion-python/pull/145) ([jdye64](https://github.com/jdye64)) -- build\(deps\): bump uuid from 0.8.2 to 1.2.2 [\#143](https://github.com/apache/datafusion-python/pull/143) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Prepare for 0.8.0 release [\#141](https://github.com/apache/datafusion-python/pull/141) ([andygrove](https://github.com/andygrove)) -- Improve README and add more examples [\#137](https://github.com/apache/datafusion-python/pull/137) ([andygrove](https://github.com/andygrove)) -- test: Expand tests for built-in functions [\#129](https://github.com/apache/datafusion-python/pull/129) ([simicd](https://github.com/simicd)) -- build\(deps\): bump object_store from 0.5.2 to 0.5.3 [\#126](https://github.com/apache/datafusion-python/pull/126) ([dependabot[bot]](https://github.com/apps/dependabot)) -- build\(deps\): bump mimalloc from 0.1.32 to 0.1.34 [\#125](https://github.com/apache/datafusion-python/pull/125) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Introduce conda directory containing 
datafusion-dev.yaml conda enviro… [\#124](https://github.com/apache/datafusion-python/pull/124) ([jdye64](https://github.com/jdye64)) -- build\(deps\): bump bzip2 from 0.4.3 to 0.4.4 [\#121](https://github.com/apache/datafusion-python/pull/121) ([dependabot[bot]](https://github.com/apps/dependabot)) -- build\(deps\): bump tokio from 1.23.0 to 1.24.1 [\#119](https://github.com/apache/datafusion-python/pull/119) ([dependabot[bot]](https://github.com/apps/dependabot)) -- build\(deps\): bump async-trait from 0.1.60 to 0.1.61 [\#118](https://github.com/apache/datafusion-python/pull/118) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Upgrade to DataFusion 16.0.0 [\#115](https://github.com/apache/datafusion-python/pull/115) ([andygrove](https://github.com/andygrove)) -- Bump async-trait from 0.1.57 to 0.1.60 [\#114](https://github.com/apache/datafusion-python/pull/114) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Bump object_store from 0.5.1 to 0.5.2 [\#112](https://github.com/apache/datafusion-python/pull/112) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Bump tokio from 1.21.2 to 1.23.0 [\#109](https://github.com/apache/datafusion-python/pull/109) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Add entries for publishing production \(asf-site\) and staging docs [\#107](https://github.com/apache/datafusion-python/pull/107) ([martin-g](https://github.com/martin-g)) -- Add a workflow that builds the docs and deploys them at staged or production [\#104](https://github.com/apache/datafusion-python/pull/104) ([martin-g](https://github.com/martin-g)) -- Upgrade to DataFusion 15.0.0 [\#103](https://github.com/apache/datafusion-python/pull/103) ([andygrove](https://github.com/andygrove)) -- build\(deps\): bump futures from 0.3.24 to 0.3.25 [\#102](https://github.com/apache/datafusion-python/pull/102) ([dependabot[bot]](https://github.com/apps/dependabot)) -- build\(deps\): bump pyo3 from 0.17.2 to 0.17.3 
[\#101](https://github.com/apache/datafusion-python/pull/101) ([dependabot[bot]](https://github.com/apps/dependabot)) -- build\(deps\): bump mimalloc from 0.1.30 to 0.1.32 [\#98](https://github.com/apache/datafusion-python/pull/98) ([dependabot[bot]](https://github.com/apps/dependabot)) -- build\(deps\): bump rand from 0.7.3 to 0.8.5 [\#97](https://github.com/apache/datafusion-python/pull/97) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Fix GitHub actions warnings [\#95](https://github.com/apache/datafusion-python/pull/95) ([martin-g](https://github.com/martin-g)) -- Fixes \#81 - Add CI workflow for source distribution [\#93](https://github.com/apache/datafusion-python/pull/93) ([martin-g](https://github.com/martin-g)) -- post-release updates [\#91](https://github.com/apache/datafusion-python/pull/91) ([andygrove](https://github.com/andygrove)) -- Build for manylinux 2014 [\#88](https://github.com/apache/datafusion-python/pull/88) ([martin-g](https://github.com/martin-g)) -- update release readme tag [\#86](https://github.com/apache/datafusion-python/pull/86) ([Jimexist](https://github.com/Jimexist)) -- Upgrade Maturin to 0.14.2 [\#85](https://github.com/apache/datafusion-python/pull/85) ([martin-g](https://github.com/martin-g)) -- Update release instructions [\#83](https://github.com/apache/datafusion-python/pull/83) ([andygrove](https://github.com/andygrove)) -- \[Functions\] - Add python function binding to `functions` [\#73](https://github.com/apache/datafusion-python/pull/73) ([francis-du](https://github.com/francis-du)) - -## [0.8.0-rc1](https://github.com/apache/datafusion-python/tree/0.8.0-rc1) (2023-02-17) - -[Full Changelog](https://github.com/apache/datafusion-python/compare/0.7.0-rc2...0.8.0-rc1) - -**Implemented enhancements:** - -- Add bindings for datafusion_common::DFField [\#184](https://github.com/apache/datafusion-python/issues/184) -- Add bindings for DFSchema/DFSchemaRef 
[\#181](https://github.com/apache/datafusion-python/issues/181) -- Add bindings for datafusion_expr Projection [\#179](https://github.com/apache/datafusion-python/issues/179) -- Add bindings for `TableScan` struct from `datafusion_expr::TableScan` [\#177](https://github.com/apache/datafusion-python/issues/177) -- Add a "mapping" struct for types [\#172](https://github.com/apache/datafusion-python/issues/172) -- Improve string representation of datafusion classes \(dataframe, context, expression, ...\) [\#158](https://github.com/apache/datafusion-python/issues/158) -- Add DataFrame count method [\#151](https://github.com/apache/datafusion-python/issues/151) -- \[REQUEST\] Github Actions Improvements [\#146](https://github.com/apache/datafusion-python/issues/146) -- Change default branch name from master to main [\#144](https://github.com/apache/datafusion-python/issues/144) -- Bump pyo3 to 0.18.0 [\#140](https://github.com/apache/datafusion-python/issues/140) -- Add script for Python linting [\#134](https://github.com/apache/datafusion-python/issues/134) -- Add Python bindings for substrait module [\#132](https://github.com/apache/datafusion-python/issues/132) -- Expand unit tests for built-in functions [\#128](https://github.com/apache/datafusion-python/issues/128) -- support creating arrow-datafusion-python conda environment [\#122](https://github.com/apache/datafusion-python/issues/122) -- Build Python source distribution in GitHub workflow [\#81](https://github.com/apache/datafusion-python/issues/81) -- EPIC: Add all functions to python binding `functions` [\#72](https://github.com/apache/datafusion-python/issues/72) - -**Fixed bugs:** - -- Build is broken [\#161](https://github.com/apache/datafusion-python/issues/161) -- Out of memory when sorting [\#157](https://github.com/apache/datafusion-python/issues/157) -- window_lead test appears to be non-deterministic [\#135](https://github.com/apache/datafusion-python/issues/135) -- Reading csv does not work 
[\#130](https://github.com/apache/datafusion-python/issues/130) -- Github actions produce a lot of warnings [\#94](https://github.com/apache/datafusion-python/issues/94) -- ASF source release tarball has wrong directory name [\#90](https://github.com/apache/datafusion-python/issues/90) -- Python Release Build failing after upgrading to maturin 14.2 [\#87](https://github.com/apache/datafusion-python/issues/87) -- Maturin build hangs on Linux ARM64 [\#84](https://github.com/apache/datafusion-python/issues/84) -- Cannot install on Mac M1 from source tarball from testpypi [\#82](https://github.com/apache/datafusion-python/issues/82) -- ImportPathMismatchError when running pytest locally [\#77](https://github.com/apache/datafusion-python/issues/77) - -**Closed issues:** - -- Publish documentation for Python bindings [\#39](https://github.com/apache/datafusion-python/issues/39) -- Add Python binding for `approx_median` [\#32](https://github.com/apache/datafusion-python/issues/32) -- Release version 0.7.0 [\#7](https://github.com/apache/datafusion-python/issues/7) - -## [0.7.0-rc2](https://github.com/apache/datafusion-python/tree/0.7.0-rc2) (2022-11-26) - -[Full Changelog](https://github.com/apache/datafusion-python/compare/0.7.0...0.7.0-rc2) - -## [Unreleased](https://github.com/datafusion-contrib/datafusion-python/tree/HEAD) - -[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.1...HEAD) - -**Merged pull requests:** - -- use \_\_getitem\_\_ for df column selection [\#41](https://github.com/datafusion-contrib/datafusion-python/pull/41) ([Jimexist](https://github.com/Jimexist)) -- fix demo in readme [\#40](https://github.com/datafusion-contrib/datafusion-python/pull/40) ([Jimexist](https://github.com/Jimexist)) -- Implement select_columns [\#39](https://github.com/datafusion-contrib/datafusion-python/pull/39) ([andygrove](https://github.com/andygrove)) -- update readme and changelog 
[\#38](https://github.com/datafusion-contrib/datafusion-python/pull/38) ([Jimexist](https://github.com/Jimexist)) -- Add PyDataFrame.explain [\#36](https://github.com/datafusion-contrib/datafusion-python/pull/36) ([andygrove](https://github.com/andygrove)) -- Release 0.5.0 [\#34](https://github.com/datafusion-contrib/datafusion-python/pull/34) ([Jimexist](https://github.com/Jimexist)) -- disable nightly in workflow [\#33](https://github.com/datafusion-contrib/datafusion-python/pull/33) ([Jimexist](https://github.com/Jimexist)) -- update requirements to 37 and 310, update readme [\#32](https://github.com/datafusion-contrib/datafusion-python/pull/32) ([Jimexist](https://github.com/Jimexist)) -- Add custom global allocator [\#30](https://github.com/datafusion-contrib/datafusion-python/pull/30) ([matthewmturner](https://github.com/matthewmturner)) -- Remove pandas dependency [\#25](https://github.com/datafusion-contrib/datafusion-python/pull/25) ([matthewmturner](https://github.com/matthewmturner)) -- upgrade datafusion and pyo3 [\#20](https://github.com/datafusion-contrib/datafusion-python/pull/20) ([Jimexist](https://github.com/Jimexist)) -- update maturin 0.12+ [\#17](https://github.com/datafusion-contrib/datafusion-python/pull/17) ([Jimexist](https://github.com/Jimexist)) -- Update README.md [\#16](https://github.com/datafusion-contrib/datafusion-python/pull/16) ([Jimexist](https://github.com/Jimexist)) -- apply cargo clippy --fix [\#15](https://github.com/datafusion-contrib/datafusion-python/pull/15) ([Jimexist](https://github.com/Jimexist)) -- update test workflow to include rust clippy and check [\#14](https://github.com/datafusion-contrib/datafusion-python/pull/14) ([Jimexist](https://github.com/Jimexist)) -- use maturin 0.12.6 [\#13](https://github.com/datafusion-contrib/datafusion-python/pull/13) ([Jimexist](https://github.com/Jimexist)) -- apply cargo fmt [\#12](https://github.com/datafusion-contrib/datafusion-python/pull/12) 
([Jimexist](https://github.com/Jimexist)) -- use stable not nightly [\#11](https://github.com/datafusion-contrib/datafusion-python/pull/11) ([Jimexist](https://github.com/Jimexist)) -- ci: test against more compilers, setup clippy and fix clippy lints [\#9](https://github.com/datafusion-contrib/datafusion-python/pull/9) ([cpcloud](https://github.com/cpcloud)) -- Fix use of importlib.metadata and unify requirements.txt [\#8](https://github.com/datafusion-contrib/datafusion-python/pull/8) ([cpcloud](https://github.com/cpcloud)) -- Ship the Cargo.lock file in the source distribution [\#7](https://github.com/datafusion-contrib/datafusion-python/pull/7) ([cpcloud](https://github.com/cpcloud)) -- add \_\_version\_\_ attribute to datafusion object [\#3](https://github.com/datafusion-contrib/datafusion-python/pull/3) ([tfeda](https://github.com/tfeda)) -- fix ci by fixing directories [\#2](https://github.com/datafusion-contrib/datafusion-python/pull/2) ([Jimexist](https://github.com/Jimexist)) -- setup workflow [\#1](https://github.com/datafusion-contrib/datafusion-python/pull/1) ([Jimexist](https://github.com/Jimexist)) - -## [0.5.1](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.1) (2022-03-15) - -[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.1-rc1...0.5.1) - -## [0.5.1-rc1](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.1-rc1) (2022-03-15) - -[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0...0.5.1-rc1) - -## [0.5.0](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0) (2022-03-10) - -[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0-rc2...0.5.0) - -## [0.5.0-rc2](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0-rc2) (2022-03-10) - -[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0-rc1...0.5.0-rc2) - -**Closed issues:** - -- Add support for Ballista 
[\#37](https://github.com/datafusion-contrib/datafusion-python/issues/37) -- Implement DataFrame.explain [\#35](https://github.com/datafusion-contrib/datafusion-python/issues/35) - -## [0.5.0-rc1](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0-rc1) (2022-03-09) - -[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/4c98b8e9c3c3f8e2e6a8f2d1ffcfefda344c4680...0.5.0-rc1) - -**Closed issues:** - -- Investigate exposing additional optimizations [\#28](https://github.com/datafusion-contrib/datafusion-python/issues/28) -- Use custom allocator in Python build [\#27](https://github.com/datafusion-contrib/datafusion-python/issues/27) -- Why is pandas a requirement? [\#24](https://github.com/datafusion-contrib/datafusion-python/issues/24) -- Unable to build [\#18](https://github.com/datafusion-contrib/datafusion-python/issues/18) -- Setup CI against multiple Python version [\#6](https://github.com/datafusion-contrib/datafusion-python/issues/6) +The changelogs have now moved [here](./dev/changelog). 
diff --git a/Cargo.lock b/Cargo.lock index 7b57b330a..d1f291be9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -111,9 +111,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.19" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "611cc2ae7d2e242c457e4be7f97036b8ad9ca152b499f53faf99b1ed8fc2553f" +checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9" [[package]] name = "android-tzdata" @@ -157,7 +157,7 @@ dependencies = [ "snap", "strum 0.25.0", "strum_macros 0.25.3", - "thiserror", + "thiserror 1.0.69", "typed-builder", "uuid", "xz2", @@ -178,9 +178,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "53.2.0" +version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4caf25cdc4a985f91df42ed9e9308e1adbcd341a31a72605c697033fcef163e3" +checksum = "c91839b07e474b3995035fd8ac33ee54f9c9ccbbb1ea33d9909c71bffdf1259d" dependencies = [ "arrow-arith", "arrow-array", @@ -200,9 +200,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "53.2.0" +version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91f2dfd1a7ec0aca967dfaa616096aec49779adc8eccec005e2f5e4111b1192a" +checksum = "855c57c4efd26722b044dcd3e348252560e3e0333087fb9f6479dc0bf744054f" dependencies = [ "arrow-array", "arrow-buffer", @@ -215,9 +215,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "53.2.0" +version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d39387ca628be747394890a6e47f138ceac1aa912eab64f02519fed24b637af8" +checksum = "bd03279cea46569acf9295f6224fbc370c5df184b4d2ecfe97ccb131d5615a7f" dependencies = [ "ahash", "arrow-buffer", @@ -226,15 +226,15 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.14.5", + "hashbrown 0.15.2", "num", ] [[package]] name = "arrow-buffer" -version = "53.2.0" 
+version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e51e05228852ffe3eb391ce7178a0f97d2cf80cc6ef91d3c4a6b3cb688049ec" +checksum = "9e4a9b9b1d6d7117f6138e13bc4dd5daa7f94e671b70e8c9c4dc37b4f5ecfc16" dependencies = [ "bytes", "half", @@ -243,9 +243,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "53.2.0" +version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d09aea56ec9fa267f3f3f6cdab67d8a9974cbba90b3aa38c8fe9d0bb071bd8c1" +checksum = "bc70e39916e60c5b7af7a8e2719e3ae589326039e1e863675a008bee5ffe90fd" dependencies = [ "arrow-array", "arrow-buffer", @@ -264,9 +264,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "53.2.0" +version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c07b5232be87d115fde73e32f2ca7f1b353bff1b44ac422d3c6fc6ae38f11f0d" +checksum = "789b2af43c1049b03a8d088ff6b2257cdcea1756cd76b174b1f2600356771b97" dependencies = [ "arrow-array", "arrow-buffer", @@ -283,9 +283,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "53.2.0" +version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b98ae0af50890b494cebd7d6b04b35e896205c1d1df7b29a6272c5d0d0249ef5" +checksum = "e4e75edf21ffd53744a9b8e3ed11101f610e7ceb1a29860432824f1834a1f623" dependencies = [ "arrow-buffer", "arrow-schema", @@ -295,9 +295,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "53.2.0" +version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ed91bdeaff5a1c00d28d8f73466bcb64d32bbd7093b5a30156b4b9f4dba3eee" +checksum = "d186a909dece9160bf8312f5124d797884f608ef5435a36d9d608e0b2a9bcbf8" dependencies = [ "arrow-array", "arrow-buffer", @@ -310,9 +310,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "53.2.0" +version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0471f51260a5309307e5d409c9dc70aede1cd9cf1d4ff0f0a1e8e1a2dd0e0d3c" +checksum = "b66ff2fedc1222942d0bd2fd391cb14a85baa3857be95c9373179bd616753b85" dependencies = [ "arrow-array", "arrow-buffer", @@ -330,9 +330,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "53.2.0" +version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2883d7035e0b600fb4c30ce1e50e66e53d8656aa729f2bfa4b51d359cf3ded52" +checksum = "ece7b5bc1180e6d82d1a60e1688c199829e8842e38497563c3ab6ea813e527fd" dependencies = [ "arrow-array", "arrow-buffer", @@ -345,9 +345,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "53.2.0" +version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "552907e8e587a6fde4f8843fd7a27a576a260f65dab6c065741ea79f633fc5be" +checksum = "745c114c8f0e8ce211c83389270de6fbe96a9088a7b32c2a041258a443fe83ff" dependencies = [ "ahash", "arrow-array", @@ -359,18 +359,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.2.0" +version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "539ada65246b949bd99ffa0881a9a15a4a529448af1a07a9838dd78617dafab1" +checksum = "b95513080e728e4cec37f1ff5af4f12c9688d47795d17cda80b6ec2cf74d4678" dependencies = [ "bitflags 2.6.0", ] [[package]] name = "arrow-select" -version = "53.2.0" +version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6259e566b752da6dceab91766ed8b2e67bf6270eb9ad8a6e07a33c1bede2b125" +checksum = "8e415279094ea70323c032c6e739c48ad8d80e78a09bef7117b8718ad5bf3722" dependencies = [ "ahash", "arrow-array", @@ -382,9 +382,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "53.2.0" +version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3179ccbd18ebf04277a095ba7321b93fd1f774f18816bd5f6b3ce2f594edb6c" +checksum = "11d956cae7002eb8d83a27dbd34daaea1cf5b75852f0b84deb4d93a276e92bbf" 
dependencies = [ "arrow-array", "arrow-buffer", @@ -411,9 +411,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.17" +version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cb8f1d480b0ea3783ab015936d2a55c87e219676f0c0b7dec61494043f21857" +checksum = "df895a515f70646414f4b45c0b79082783b80552b373a68283012928df56f522" dependencies = [ "bzip2", "flate2", @@ -444,7 +444,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -455,7 +455,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -529,9 +529,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.4" +version = "1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82033247fd8e890df8f740e407ad4d038debb9eb1f40533fffb32e7d17dc6f7" +checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e" dependencies = [ "arrayref", "arrayvec", @@ -584,9 +584,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" [[package]] name = "bzip2" @@ -611,9 +611,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.37" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40545c26d092346d8a8dab71ee48e7685a7a9cba76e634790c215b41a4a7b4cf" +checksum = "f34d93e62b03caf570cccc334cbc6c2fceca82f39211051345108adcba3eebdc" dependencies = [ "jobserver", "libc", @@ -668,18 +668,18 @@ dependencies = [ [[package]] name = "cmake" 
-version = "0.1.51" +version = "0.1.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" +checksum = "c682c223677e0e5b6b7f63a64b9351844c3f1b1678a68b7ee617e30fb082620e" dependencies = [ "cc", ] [[package]] name = "comfy-table" -version = "7.1.1" +version = "7.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" +checksum = "24f165e7b643266ea80cb858aed492ad9280e3e05ce24d4a99d7d7b889b6a4d9" dependencies = [ "strum 0.26.3", "strum_macros 0.26.4", @@ -720,9 +720,9 @@ checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" [[package]] name = "core-foundation" -version = "0.9.4" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +checksum = "b55271e5c8c478ad3f38ad24ef34923091e0548492a266d19b3c0b4d82574c63" dependencies = [ "core-foundation-sys", "libc", @@ -760,9 +760,9 @@ checksum = "69f3b219d28b6e3b4ac87bc1fc522e0803ab22e055da177bff0068c4150c61a6" [[package]] name = "cpufeatures" -version = "0.2.14" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0" +checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3" dependencies = [ "libc", ] @@ -809,9 +809,9 @@ dependencies = [ [[package]] name = "csv" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" dependencies = [ "csv-core", "itoa", @@ -1288,7 +1288,7 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "42.0.0" +version = "43.0.0" dependencies = [ 
"arrow", "async-trait", @@ -1364,7 +1364,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -1393,12 +1393,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.9" +version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -1425,9 +1425,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.34" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" dependencies = [ "crc32fast", "miniz_oxide", @@ -1504,7 +1504,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -1563,8 +1563,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi", + "wasm-bindgen", ] [[package]] @@ -1581,9 +1583,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "524e8ac6999421f49a846c2d4411f337e53497d8ec55d67753beffa43c5d9205" +checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e" dependencies = [ "atomic-waker", "bytes", @@ -1621,9 +1623,9 @@ dependencies = [ [[package]] name = 
"hashbrown" -version = "0.15.1" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" [[package]] name = "heck" @@ -1697,9 +1699,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbbff0a806a4728c99295b254c8838933b5b082d75e3cb70c8dab21fdfbcfa9a" +checksum = "97818827ef4f364230e16705d4706e2897df2bb60617d6ca15d598025a3c481f" dependencies = [ "bytes", "futures-channel", @@ -1890,7 +1892,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -1921,7 +1923,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" dependencies = [ "equivalent", - "hashbrown 0.15.1", + "hashbrown 0.15.2", ] [[package]] @@ -1965,9 +1967,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.11" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" [[package]] name = "jobserver" @@ -1980,10 +1982,11 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.72" +version = "0.3.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" +checksum = "a865e038f7f6ed956f788f0d7d60c541fff74c7bd74272c5d4cf15c63743e705" dependencies = [ + "once_cell", "wasm-bindgen", ] @@ -2059,9 +2062,9 @@ dependencies = [ [[package]] name = "libc" -version = 
"0.2.162" +version = "0.2.167" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18d287de67fe55fd7e1581fe933d965a5a9477b38e949cfa9f8574ef01506398" +checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" [[package]] name = "libflate" @@ -2121,9 +2124,9 @@ checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "litemap" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" +checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" [[package]] name = "lock_api" @@ -2212,11 +2215,10 @@ dependencies = [ [[package]] name = "mio" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ - "hermit-abi", "libc", "wasi", "windows-sys 0.52.0", @@ -2398,9 +2400,9 @@ dependencies = [ [[package]] name = "parquet" -version = "53.2.0" +version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dea02606ba6f5e856561d8d507dba8bac060aefca2a6c0f1aa1d361fed91ff3e" +checksum = "2b449890367085eb65d7d3321540abc3d7babbd179ce31df0016e90719114191" dependencies = [ "ahash", "arrow-array", @@ -2417,7 +2419,7 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.14.5", + "hashbrown 0.15.2", "lz4_flex", "num", "num-bigint", @@ -2558,9 +2560,9 @@ checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" [[package]] name = "portable-atomic" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" +checksum = 
"280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" [[package]] name = "ppv-lite86" @@ -2578,14 +2580,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64d1ec885c64d0457d564db4ec299b2dae3f9c02808b8ad9c3a089c591b18033" dependencies = [ "proc-macro2", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] name = "proc-macro2" -version = "1.0.89" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] @@ -2617,7 +2619,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.87", + "syn 2.0.90", "tempfile", ] @@ -2631,7 +2633,7 @@ dependencies = [ "itertools", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -2699,7 +2701,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -2712,7 +2714,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -2733,9 +2735,9 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.5" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c7c5fdde3cdae7203427dc4f0a68fe0ed09833edc525a03456b153b79828684" +checksum = "62e96808277ec6f97351a2380e6c25114bc9e67037775464979f3037c92d05ef" dependencies = [ "bytes", "pin-project-lite", @@ -2744,26 +2746,29 @@ dependencies = [ "rustc-hash", "rustls", "socket2", - "thiserror", + "thiserror 2.0.3", "tokio", "tracing", ] [[package]] name = "quinn-proto" -version = "0.11.8" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6" +checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" dependencies = [ "bytes", + 
"getrandom", "rand", "ring", "rustc-hash", "rustls", + "rustls-pki-types", "slab", - "thiserror", + "thiserror 2.0.3", "tinyvec", "tracing", + "web-time", ] [[package]] @@ -2842,9 +2847,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -2956,9 +2961,9 @@ checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustc-hash" -version = "2.0.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" +checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" [[package]] name = "rustc_version" @@ -2971,9 +2976,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.39" +version = "0.38.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "375116bee2be9ed569afe2154ea6a99dfdffd257f533f187498c2a8f5feaf4ee" +checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" dependencies = [ "bitflags 2.6.0", "errno", @@ -2984,9 +2989,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.16" +version = "0.23.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e" +checksum = "934b404430bb06b3fae2cba809eb45a1ab1aecd64491213d7c3301b88393f8d1" dependencies = [ "once_cell", "ring", @@ -2998,12 +3003,11 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a" +checksum = 
"7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" dependencies = [ "openssl-probe", - "rustls-pemfile", "rustls-pki-types", "schannel", "security-framework", @@ -3023,6 +3027,9 @@ name = "rustls-pki-types" version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" +dependencies = [ + "web-time", +] [[package]] name = "rustls-webpki" @@ -3058,9 +3065,9 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01227be5826fa0690321a2ba6c5cd57a19cf3f6a09e76973b58e61de6ab9d1c1" +checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d" dependencies = [ "windows-sys 0.59.0", ] @@ -3086,7 +3093,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -3097,9 +3104,9 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "security-framework" -version = "2.11.1" +version = "3.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +checksum = "e1415a607e92bec364ea2cf9264646dcce0f91e6d65281bd6f2819cca3bf39c8" dependencies = [ "bitflags 2.6.0", "core-foundation", @@ -3135,22 +3142,22 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.214" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.214" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -3161,14 +3168,14 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] name = "serde_json" -version = "1.0.132" +version = "1.0.133" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" +checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" dependencies = [ "itoa", "memchr", @@ -3185,7 +3192,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -3269,7 +3276,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -3280,9 +3287,9 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "socket2" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8" dependencies = [ "libc", "windows-sys 0.52.0", @@ -3312,7 +3319,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -3352,7 +3359,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -3365,7 +3372,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -3389,7 +3396,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.87", + "syn 2.0.90", "typify", "walkdir", ] @@ -3413,9 
+3420,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.87" +version = "2.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" dependencies = [ "proc-macro2", "quote", @@ -3424,9 +3431,9 @@ dependencies = [ [[package]] name = "sync_wrapper" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" dependencies = [ "futures-core", ] @@ -3439,7 +3446,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -3463,22 +3470,42 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.68" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02dd99dc800bbb97186339685293e1cc5d9df1f8fae2d0aecd9ff1c77efea892" +checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa" dependencies = [ - "thiserror-impl", + "thiserror-impl 2.0.3", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", ] [[package]] name = "thiserror-impl" -version = "1.0.68" +version = "2.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a7c61ec9a6f64d2793d8a45faba21efbe3ced62a886d44c36a009b2b519b4c7e" +checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -3550,7 +3577,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -3585,9 +3612,9 @@ checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "pin-project-lite", "tracing-attributes", @@ -3596,20 +3623,20 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", ] @@ -3668,7 +3695,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] @@ -3702,8 +3729,8 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.87", - "thiserror", + "syn 2.0.90", + "thiserror 1.0.69", "unicode-ident", ] @@ -3720,15 +3747,15 @@ 
dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.87", + "syn 2.0.90", "typify-impl", ] [[package]] name = "unicode-ident" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" [[package]] name = "unicode-segmentation" @@ -3738,9 +3765,9 @@ checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" -version = "0.1.14" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" [[package]] name = "unindent" @@ -3762,9 +3789,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.3" +version = "2.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d157f1b96d14500ffdc1f10ba712e780825526c03d9a49b4d0324b0d9113ada" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" dependencies = [ "form_urlencoded", "idna", @@ -3826,9 +3853,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" +checksum = "d15e63b4482863c109d70a7b8706c1e364eb6ea449b201a76c5b89cedcec2d5c" dependencies = [ "cfg-if", "once_cell", @@ -3837,36 +3864,37 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" +checksum = "8d36ef12e3aaca16ddd3f67922bc63e48e953f126de60bd33ccc0101ef9998cd" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.45" +version = "0.4.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7ec4f8827a71586374db3e87abdb5a2bb3a15afed140221307c3ec06b1f63b" +checksum = "9dfaf8f50e5f293737ee323940c7d8b08a66a95a419223d9f41610ca08b0833d" dependencies = [ "cfg-if", "js-sys", + "once_cell", "wasm-bindgen", "web-sys", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" +checksum = "705440e08b42d3e4b36de7d66c944be628d579796b8090bfa3471478a2260051" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3874,22 +3902,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" +checksum = "98c9ae5a76e46f4deecd0f0255cc223cfa18dc9b261213b8aa0c7b36f61b3f1d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.95" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" +checksum = "6ee99da9c5ba11bd675621338ef6fa52296b76b83305e9b6e5c77d4c286d6d49" [[package]] name = "wasm-streams" @@ -3906,9 +3934,19 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.72" +version = "0.3.74" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" +checksum = "a98bc3c33f0fe7e59ad7cd041b89034fa82a7c2d4365ca538dda6cdaf513863c" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" dependencies = [ "js-sys", "wasm-bindgen", @@ -4089,9 +4127,9 @@ dependencies = [ [[package]] name = "yoke" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" dependencies = [ "serde", "stable_deref_trait", @@ -4101,13 +4139,13 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", "synstructure", ] @@ -4129,27 +4167,27 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] name = "zerofrom" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55" +checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" +checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", "synstructure", ] @@ -4178,7 +4216,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.90", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 02707b957..703fc5a26 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "42.0.0" +version = "43.0.0" homepage = "https://datafusion.apache.org/python" repository = "https://github.com/apache/datafusion-python" authors = ["Apache DataFusion "] diff --git a/dev/changelog/43.0.0.md b/dev/changelog/43.0.0.md new file mode 100644 index 000000000..bbb766910 --- /dev/null +++ b/dev/changelog/43.0.0.md @@ -0,0 +1,73 @@ + + +# Apache DataFusion Python 43.0.0 Changelog + +This release consists of 26 commits from 7 contributors. See credits at the end of this changelog for more information. 
+ +**Implemented enhancements:** + +- feat: expose `drop` method [#913](https://github.com/apache/datafusion-python/pull/913) (ion-elgreco) +- feat: expose `join_on` [#914](https://github.com/apache/datafusion-python/pull/914) (ion-elgreco) +- feat: add fill_null/nan expressions [#919](https://github.com/apache/datafusion-python/pull/919) (ion-elgreco) +- feat: add `with_columns` [#909](https://github.com/apache/datafusion-python/pull/909) (ion-elgreco) +- feat: add `cast` to DataFrame [#916](https://github.com/apache/datafusion-python/pull/916) (ion-elgreco) +- feat: add `head`, `tail` methods [#915](https://github.com/apache/datafusion-python/pull/915) (ion-elgreco) + +**Fixed bugs:** + +- fix: remove use of deprecated `make_scalar_function` [#906](https://github.com/apache/datafusion-python/pull/906) (Michael-J-Ward) +- fix: udwf example [#948](https://github.com/apache/datafusion-python/pull/948) (mesejo) + +**Other:** + +- Ts/minor updates release process [#903](https://github.com/apache/datafusion-python/pull/903) (timsaucer) +- build(deps): bump pyo3 from 0.22.3 to 0.22.4 [#910](https://github.com/apache/datafusion-python/pull/910) (dependabot[bot]) +- refactor: `from_arrow` use protocol typehints [#917](https://github.com/apache/datafusion-python/pull/917) (ion-elgreco) +- Change requires-python version in pyproject.toml [#924](https://github.com/apache/datafusion-python/pull/924) (kosiew) +- chore: deprecate `select_columns` [#911](https://github.com/apache/datafusion-python/pull/911) (ion-elgreco) +- build(deps): bump uuid from 1.10.0 to 1.11.0 [#927](https://github.com/apache/datafusion-python/pull/927) (dependabot[bot]) +- Add array_empty scalar function [#931](https://github.com/apache/datafusion-python/pull/931) (kosiew) +- add `cardinality` function to calculate total distinct elements in an array [#937](https://github.com/apache/datafusion-python/pull/937) (kosiew) +- Add empty scalar function (alias of array_empty), fix a small typo 
[#938](https://github.com/apache/datafusion-python/pull/938) (kosiew) +- README How to develop section now also works on Apple M1 [#940](https://github.com/apache/datafusion-python/pull/940) (drauschenbach) +- refactor: dataframe `join` params [#912](https://github.com/apache/datafusion-python/pull/912) (ion-elgreco) +- Upgrade to Datafusion 43 [#905](https://github.com/apache/datafusion-python/pull/905) (Michael-J-Ward) +- build(deps): bump tokio from 1.40.0 to 1.41.1 [#946](https://github.com/apache/datafusion-python/pull/946) (dependabot[bot]) +- Add list_cat, list_concat, list_repeat [#942](https://github.com/apache/datafusion-python/pull/942) (kosiew) +- Add foreign table providers [#921](https://github.com/apache/datafusion-python/pull/921) (timsaucer) +- Add make_list and tests for make_list, make_array [#949](https://github.com/apache/datafusion-python/pull/949) (kosiew) +- Documentation updates: simplify examples and add section on data sources [#955](https://github.com/apache/datafusion-python/pull/955) (timsaucer) +- Add datafusion.extract [#959](https://github.com/apache/datafusion-python/pull/959) (kosiew) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 9 Ion Koutsouris + 7 kosiew + 3 Tim Saucer + 3 dependabot[bot] + 2 Michael J Ward + 1 Daniel Mesejo + 1 David Rauschenbach +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. diff --git a/dev/changelog/pre-43.0.0.md b/dev/changelog/pre-43.0.0.md new file mode 100644 index 000000000..ae3a2348a --- /dev/null +++ b/dev/changelog/pre-43.0.0.md @@ -0,0 +1,715 @@ + + +# DataFusion Python Changelog + +## [42.0.0](https://github.com/apache/datafusion-python/tree/42.0.0) (2024-10-06) + +This release consists of 20 commits from 6 contributors. See credits at the end of this changelog for more information. 
+ +**Implemented enhancements:** + +- feat: expose between [#868](https://github.com/apache/datafusion-python/pull/868) (mesejo) +- feat: make register_csv accept a list of paths [#883](https://github.com/apache/datafusion-python/pull/883) (mesejo) +- feat: expose http object store [#885](https://github.com/apache/datafusion-python/pull/885) (mesejo) + +**Fixed bugs:** + +- fix: Calling `count` on a pyarrow dataset results in an error [#843](https://github.com/apache/datafusion-python/pull/843) (Michael-J-Ward) + +**Other:** + +- Upgrade datafusion [#867](https://github.com/apache/datafusion-python/pull/867) (emgeee) +- Feature/aggregates as windows [#871](https://github.com/apache/datafusion-python/pull/871) (timsaucer) +- Fix regression on register_udaf [#878](https://github.com/apache/datafusion-python/pull/878) (timsaucer) +- build(deps): upgrade setup-protoc action and protoc version number [#873](https://github.com/apache/datafusion-python/pull/873) (Michael-J-Ward) +- build(deps): bump prost-types from 0.13.2 to 0.13.3 [#881](https://github.com/apache/datafusion-python/pull/881) (dependabot[bot]) +- build(deps): bump prost from 0.13.2 to 0.13.3 [#882](https://github.com/apache/datafusion-python/pull/882) (dependabot[bot]) +- chore: remove XFAIL from passing tests [#884](https://github.com/apache/datafusion-python/pull/884) (Michael-J-Ward) +- Add user defined window function support [#880](https://github.com/apache/datafusion-python/pull/880) (timsaucer) +- build(deps): bump syn from 2.0.77 to 2.0.79 [#886](https://github.com/apache/datafusion-python/pull/886) (dependabot[bot]) +- fix example of reading parquet from s3 [#896](https://github.com/apache/datafusion-python/pull/896) (sir-sigurd) +- release-testing [#889](https://github.com/apache/datafusion-python/pull/889) (Michael-J-Ward) +- chore(bench): fix create_tables.sql for tpch benchmark [#897](https://github.com/apache/datafusion-python/pull/897) (Michael-J-Ward) +- Add physical and logical plan 
conversion to and from protobuf [#892](https://github.com/apache/datafusion-python/pull/892) (timsaucer) +- Feature/instance udfs [#890](https://github.com/apache/datafusion-python/pull/890) (timsaucer) +- chore(ci): remove Mambaforge variant from CI [#894](https://github.com/apache/datafusion-python/pull/894) (Michael-J-Ward) +- Use OnceLock to store TokioRuntime [#895](https://github.com/apache/datafusion-python/pull/895) (Michael-J-Ward) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 7 Michael J Ward + 5 Tim Saucer + 3 Daniel Mesejo + 3 dependabot[bot] + 1 Matt Green + 1 Sergey Fedoseev +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + +## [41.0.0](https://github.com/apache/datafusion-python/tree/41.0.0) (2024-09-09) + +This release consists of 19 commits from 6 contributors. See credits at the end of this changelog for more information. 
+ +**Implemented enhancements:** + +- feat: enable list of paths for read_csv [#824](https://github.com/apache/datafusion-python/pull/824) (mesejo) +- feat: better exception and message for table not found [#851](https://github.com/apache/datafusion-python/pull/851) (mesejo) +- feat: make cast accept built-in Python types [#858](https://github.com/apache/datafusion-python/pull/858) (mesejo) + +**Other:** + +- chore: Prepare for 40.0.0 release [#801](https://github.com/apache/datafusion-python/pull/801) (andygrove) +- Add typing-extensions dependency to pyproject [#805](https://github.com/apache/datafusion-python/pull/805) (timsaucer) +- Upgrade deps to datafusion 41 [#802](https://github.com/apache/datafusion-python/pull/802) (Michael-J-Ward) +- Fix SessionContext init with only SessionConfig [#827](https://github.com/apache/datafusion-python/pull/827) (jcrist) +- build(deps): upgrade actions/{upload,download}-artifact@v3 to v4 [#829](https://github.com/apache/datafusion-python/pull/829) (Michael-J-Ward) +- Run ruff format in CI [#837](https://github.com/apache/datafusion-python/pull/837) (timsaucer) +- Add PyCapsule support for Arrow import and export [#825](https://github.com/apache/datafusion-python/pull/825) (timsaucer) +- Feature/expose when function [#836](https://github.com/apache/datafusion-python/pull/836) (timsaucer) +- Add Window Functions for use with function builder [#808](https://github.com/apache/datafusion-python/pull/808) (timsaucer) +- chore: fix typos [#844](https://github.com/apache/datafusion-python/pull/844) (mesejo) +- build(ci): use proper mac runners [#841](https://github.com/apache/datafusion-python/pull/841) (Michael-J-Ward) +- Set of small features [#839](https://github.com/apache/datafusion-python/pull/839) (timsaucer) +- chore: fix docstrings, typos [#852](https://github.com/apache/datafusion-python/pull/852) (mesejo) +- chore: Use datafusion re-exported dependencies [#856](https://github.com/apache/datafusion-python/pull/856) 
(emgeee) +- add guidelines on separating python and rust code [#860](https://github.com/apache/datafusion-python/pull/860) (Michael-J-Ward) +- Update Aggregate functions to take builder parameters [#859](https://github.com/apache/datafusion-python/pull/859) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 7 Tim Saucer + 5 Daniel Mesejo + 4 Michael J Ward + 1 Andy Grove + 1 Jim Crist-Harif + 1 Matt Green +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + +## [40.0.0](https://github.com/apache/datafusion-python/tree/40.0.0) (2024-08-09) + +This release consists of 18 commits from 4 contributors. See credits at the end of this changelog for more information. + +- Update changelog for 39.0.0 [#742](https://github.com/apache/datafusion-python/pull/742) (andygrove) +- build(deps): bump uuid from 1.8.0 to 1.9.1 [#744](https://github.com/apache/datafusion-python/pull/744) (dependabot[bot]) +- build(deps): bump mimalloc from 0.1.42 to 0.1.43 [#745](https://github.com/apache/datafusion-python/pull/745) (dependabot[bot]) +- build(deps): bump syn from 2.0.67 to 2.0.68 [#746](https://github.com/apache/datafusion-python/pull/746) (dependabot[bot]) +- Tsaucer/find window fn [#747](https://github.com/apache/datafusion-python/pull/747) (timsaucer) +- Python wrapper classes for all user interfaces [#750](https://github.com/apache/datafusion-python/pull/750) (timsaucer) +- Expose array sort [#764](https://github.com/apache/datafusion-python/pull/764) (timsaucer) +- Upgrade protobuf and remove GH Action googletest-installer [#773](https://github.com/apache/datafusion-python/pull/773) (Michael-J-Ward) +- Upgrade Datafusion 40 [#771](https://github.com/apache/datafusion-python/pull/771) (Michael-J-Ward) +- Bugfix: Calling count with None arguments 
[#768](https://github.com/apache/datafusion-python/pull/768) (timsaucer) +- Add in user example that compares two different approaches to UDFs [#770](https://github.com/apache/datafusion-python/pull/770) (timsaucer) +- Add missing exports for wrapper modules [#782](https://github.com/apache/datafusion-python/pull/782) (timsaucer) +- Add PyExpr to_variant conversions [#793](https://github.com/apache/datafusion-python/pull/793) (Michael-J-Ward) +- Add missing expressions to wrapper export [#795](https://github.com/apache/datafusion-python/pull/795) (timsaucer) +- Doc/cross reference [#791](https://github.com/apache/datafusion-python/pull/791) (timsaucer) +- Re-Enable `num_centroids` to `approx_percentile_cont` [#798](https://github.com/apache/datafusion-python/pull/798) (Michael-J-Ward) +- UDAF process all state variables [#799](https://github.com/apache/datafusion-python/pull/799) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 9 Tim Saucer + 4 Michael J Ward + 3 dependabot[bot] + 2 Andy Grove +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. 
+ +## [39.0.0](https://github.com/apache/datafusion-python/tree/39.0.0) (2024-06-25) + +**Merged pull requests:** + +- ci: add substrait feature to linux builds [#720](https://github.com/apache/datafusion-python/pull/720) (Michael-J-Ward) +- Docs deploy action [#721](https://github.com/apache/datafusion-python/pull/721) (Michael-J-Ward) +- update deps [#723](https://github.com/apache/datafusion-python/pull/723) (Michael-J-Ward) +- Upgrade maturin [#725](https://github.com/apache/datafusion-python/pull/725) (Michael-J-Ward) +- Upgrade datafusion 39 [#728](https://github.com/apache/datafusion-python/pull/728) (Michael-J-Ward) +- use ScalarValue::to_pyarrow to convert to python object [#731](https://github.com/apache/datafusion-python/pull/731) (Michael-J-Ward) +- Pyo3 `Bound<'py, T>` api [#734](https://github.com/apache/datafusion-python/pull/734) (Michael-J-Ward) +- github test action: drop python 3.7, add python 3.12 [#736](https://github.com/apache/datafusion-python/pull/736) (Michael-J-Ward) +- Pyarrow filter pushdowns [#735](https://github.com/apache/datafusion-python/pull/735) (Michael-J-Ward) +- build(deps): bump syn from 2.0.66 to 2.0.67 [#738](https://github.com/apache/datafusion-python/pull/738) (dependabot[bot]) +- Pyo3 refactorings [#740](https://github.com/apache/datafusion-python/pull/740) (Michael-J-Ward) +- UDAF `sum` workaround [#741](https://github.com/apache/datafusion-python/pull/741) (Michael-J-Ward) + +## [38.0.1](https://github.com/apache/datafusion-python/tree/38.0.1) (2024-05-25) + +**Implemented enhancements:** + +- feat: add python bindings for ends_with function [#693](https://github.com/apache/datafusion-python/pull/693) (richtia) +- feat: expose `named_struct` in python [#700](https://github.com/apache/datafusion-python/pull/700) (Michael-J-Ward) + +**Merged pull requests:** + +- Add document about basics of working with expressions [#668](https://github.com/apache/datafusion-python/pull/668) (timsaucer) +- chore: Update Python release 
process now that DataFusion is TLP [#674](https://github.com/apache/datafusion-python/pull/674) (andygrove) +- Fix Docs [#676](https://github.com/apache/datafusion-python/pull/676) (Michael-J-Ward) +- Add examples from TPC-H [#666](https://github.com/apache/datafusion-python/pull/666) (timsaucer) +- fix conda nightly builds, attempt 2 [#689](https://github.com/apache/datafusion-python/pull/689) (Michael-J-Ward) +- Upgrade to datafusion 38 [#691](https://github.com/apache/datafusion-python/pull/691) (Michael-J-Ward) +- chore: update to maturin's recommended project layout for rust/python… [#695](https://github.com/apache/datafusion-python/pull/695) (Michael-J-Ward) +- chore: update cargo deps [#698](https://github.com/apache/datafusion-python/pull/698) (Michael-J-Ward) +- feat: add python bindings for ends_with function [#693](https://github.com/apache/datafusion-python/pull/693) (richtia) +- feat: expose `named_struct` in python [#700](https://github.com/apache/datafusion-python/pull/700) (Michael-J-Ward) +- Website fixes [#702](https://github.com/apache/datafusion-python/pull/702) (Michael-J-Ward) + +## [37.1.0](https://github.com/apache/datafusion-python/tree/37.1.0) (2024-05-08) + +**Implemented enhancements:** + +- feat: add execute_stream and execute_stream_partitioned [#610](https://github.com/apache/datafusion-python/pull/610) (mesejo) + +**Documentation updates:** + +- docs: update docs CI to install python-311 requirements [#661](https://github.com/apache/datafusion-python/pull/661) (Michael-J-Ward) + +**Merged pull requests:** + +- Switch to Ruff for Python linting [#529](https://github.com/apache/datafusion-python/pull/529) (andygrove) +- Remove sql-on-pandas/polars/cudf examples [#602](https://github.com/apache/datafusion-python/pull/602) (andygrove) +- build(deps): bump object_store from 0.9.0 to 0.9.1 [#611](https://github.com/apache/datafusion-python/pull/611) (dependabot[bot]) +- More missing array funcs 
[#605](https://github.com/apache/datafusion-python/pull/605) (judahrand) +- feat: add execute_stream and execute_stream_partitioned [#610](https://github.com/apache/datafusion-python/pull/610) (mesejo) +- build(deps): bump uuid from 1.7.0 to 1.8.0 [#615](https://github.com/apache/datafusion-python/pull/615) (dependabot[bot]) +- Bind SQLOptions and relative ctx method #567 [#588](https://github.com/apache/datafusion-python/pull/588) (giacomorebecchi) +- bugfix: no panic on empty table [#613](https://github.com/apache/datafusion-python/pull/613) (mesejo) +- Expose `register_listing_table` [#618](https://github.com/apache/datafusion-python/pull/618) (henrifroese) +- Expose unnest feature [#641](https://github.com/apache/datafusion-python/pull/641) (timsaucer) +- Update domain names and paths in asf yaml [#643](https://github.com/apache/datafusion-python/pull/643) (andygrove) +- use python 3.11 to publish docs [#645](https://github.com/apache/datafusion-python/pull/645) (andygrove) +- docs: update docs CI to install python-311 requirements [#661](https://github.com/apache/datafusion-python/pull/661) (Michael-J-Ward) +- Upgrade Datafusion to v37.1.0 [#669](https://github.com/apache/datafusion-python/pull/669) (Michael-J-Ward) + +## [36.0.0](https://github.com/apache/datafusion-python/tree/36.0.0) (2024-03-02) + +**Implemented enhancements:** + +- feat: Add `flatten` array function [#562](https://github.com/apache/datafusion-python/pull/562) (mobley-trent) + +**Documentation updates:** + +- docs: Add ASF attribution [#580](https://github.com/apache/datafusion-python/pull/580) (simicd) + +**Merged pull requests:** + +- Allow PyDataFrame to be used from other projects [#582](https://github.com/apache/datafusion-python/pull/582) (andygrove) +- docs: Add ASF attribution [#580](https://github.com/apache/datafusion-python/pull/580) (simicd) +- Add array functions [#560](https://github.com/apache/datafusion-python/pull/560) (ongchi) +- feat: Add `flatten` array function 
[#562](https://github.com/apache/datafusion-python/pull/562) (mobley-trent) + +## [35.0.0](https://github.com/apache/datafusion-python/tree/35.0.0) (2024-01-20) + +**Merged pull requests:** + +- build(deps): bump syn from 2.0.41 to 2.0.43 [#559](https://github.com/apache/datafusion-python/pull/559) (dependabot[bot]) +- build(deps): bump tokio from 1.35.0 to 1.35.1 [#558](https://github.com/apache/datafusion-python/pull/558) (dependabot[bot]) +- build(deps): bump async-trait from 0.1.74 to 0.1.77 [#556](https://github.com/apache/datafusion-python/pull/556) (dependabot[bot]) +- build(deps): bump pyo3 from 0.20.0 to 0.20.2 [#557](https://github.com/apache/datafusion-python/pull/557) (dependabot[bot]) + +## [34.0.0](https://github.com/apache/datafusion-python/tree/34.0.0) (2023-12-28) + +**Merged pull requests:** + +- Adjust visibility of crate private members & Functions [#537](https://github.com/apache/datafusion-python/pull/537) (jdye64) +- Update json.rst [#538](https://github.com/apache/datafusion-python/pull/538) (ray-andrew) +- Enable mimalloc local_dynamic_tls feature [#540](https://github.com/apache/datafusion-python/pull/540) (jdye64) +- Enable substrait feature to be built by default in CI, for nightlies … [#544](https://github.com/apache/datafusion-python/pull/544) (jdye64) + +## [33.0.0](https://github.com/apache/datafusion-python/tree/33.0.0) (2023-11-16) + +**Merged pull requests:** + +- First pass at getting architectured builds working [#350](https://github.com/apache/datafusion-python/pull/350) (charlesbluca) +- Remove libprotobuf dep [#527](https://github.com/apache/datafusion-python/pull/527) (jdye64) + +## [32.0.0](https://github.com/apache/datafusion-python/tree/32.0.0) (2023-10-21) + +**Implemented enhancements:** + +- feat: expose PyWindowFrame [#509](https://github.com/apache/datafusion-python/pull/509) (dlovell) +- add Binary String Functions;encode,decode [#494](https://github.com/apache/datafusion-python/pull/494) (jiangzhx) +- add 
bit_and,bit_or,bit_xor,bool_and,bool_or [#496](https://github.com/apache/datafusion-python/pull/496) (jiangzhx) +- add first_value last_value [#498](https://github.com/apache/datafusion-python/pull/498) (jiangzhx) +- add regr\_\* functions [#499](https://github.com/apache/datafusion-python/pull/499) (jiangzhx) +- Add random missing bindings [#522](https://github.com/apache/datafusion-python/pull/522) (jdye64) +- Allow for multiple input files per table instead of a single file [#519](https://github.com/apache/datafusion-python/pull/519) (jdye64) +- Add support for window function bindings [#521](https://github.com/apache/datafusion-python/pull/521) (jdye64) + +**Merged pull requests:** + +- Prepare 31.0.0 release [#500](https://github.com/apache/datafusion-python/pull/500) (andygrove) +- Improve release process documentation [#505](https://github.com/apache/datafusion-python/pull/505) (andygrove) +- add Binary String Functions;encode,decode [#494](https://github.com/apache/datafusion-python/pull/494) (jiangzhx) +- build(deps): bump mimalloc from 0.1.38 to 0.1.39 [#502](https://github.com/apache/datafusion-python/pull/502) (dependabot[bot]) +- build(deps): bump syn from 2.0.32 to 2.0.35 [#503](https://github.com/apache/datafusion-python/pull/503) (dependabot[bot]) +- build(deps): bump syn from 2.0.35 to 2.0.37 [#506](https://github.com/apache/datafusion-python/pull/506) (dependabot[bot]) +- Use latest DataFusion [#511](https://github.com/apache/datafusion-python/pull/511) (andygrove) +- add bit_and,bit_or,bit_xor,bool_and,bool_or [#496](https://github.com/apache/datafusion-python/pull/496) (jiangzhx) +- use DataFusion 32 [#515](https://github.com/apache/datafusion-python/pull/515) (andygrove) +- add first_value last_value [#498](https://github.com/apache/datafusion-python/pull/498) (jiangzhx) +- build(deps): bump regex-syntax from 0.7.5 to 0.8.1 [#517](https://github.com/apache/datafusion-python/pull/517) (dependabot[bot]) +- build(deps): bump pyo3-build-config from 
0.19.2 to 0.20.0 [#516](https://github.com/apache/datafusion-python/pull/516) (dependabot[bot]) +- add regr\_\* functions [#499](https://github.com/apache/datafusion-python/pull/499) (jiangzhx) +- Add random missing bindings [#522](https://github.com/apache/datafusion-python/pull/522) (jdye64) +- build(deps): bump rustix from 0.38.18 to 0.38.19 [#523](https://github.com/apache/datafusion-python/pull/523) (dependabot[bot]) +- Allow for multiple input files per table instead of a single file [#519](https://github.com/apache/datafusion-python/pull/519) (jdye64) +- Add support for window function bindings [#521](https://github.com/apache/datafusion-python/pull/521) (jdye64) +- Small clippy fix [#524](https://github.com/apache/datafusion-python/pull/524) (andygrove) + +## [31.0.0](https://github.com/apache/datafusion-python/tree/31.0.0) (2023-09-12) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/28.0.0...31.0.0) + +**Implemented enhancements:** + +- feat: add case function (#447) [#448](https://github.com/apache/datafusion-python/pull/448) (mesejo) +- feat: add compression options [#456](https://github.com/apache/datafusion-python/pull/456) (mesejo) +- feat: add register_json [#458](https://github.com/apache/datafusion-python/pull/458) (mesejo) +- feat: add basic compression configuration to write_parquet [#459](https://github.com/apache/datafusion-python/pull/459) (mesejo) +- feat: add example of reading parquet from s3 [#460](https://github.com/apache/datafusion-python/pull/460) (mesejo) +- feat: add register_avro and read_table [#461](https://github.com/apache/datafusion-python/pull/461) (mesejo) +- feat: add missing scalar math functions [#465](https://github.com/apache/datafusion-python/pull/465) (mesejo) + +**Documentation updates:** + +- docs: include pre-commit hooks section in contributor guide [#455](https://github.com/apache/datafusion-python/pull/455) (mesejo) + +**Merged pull requests:** + +- Build Linux aarch64 wheel 
[#443](https://github.com/apache/datafusion-python/pull/443) (gokselk) +- feat: add case function (#447) [#448](https://github.com/apache/datafusion-python/pull/448) (mesejo) +- enhancement(docs): Add user guide (#432) [#445](https://github.com/apache/datafusion-python/pull/445) (mesejo) +- docs: include pre-commit hooks section in contributor guide [#455](https://github.com/apache/datafusion-python/pull/455) (mesejo) +- feat: add compression options [#456](https://github.com/apache/datafusion-python/pull/456) (mesejo) +- Upgrade to DF 28.0.0-rc1 [#457](https://github.com/apache/datafusion-python/pull/457) (andygrove) +- feat: add register_json [#458](https://github.com/apache/datafusion-python/pull/458) (mesejo) +- feat: add basic compression configuration to write_parquet [#459](https://github.com/apache/datafusion-python/pull/459) (mesejo) +- feat: add example of reading parquet from s3 [#460](https://github.com/apache/datafusion-python/pull/460) (mesejo) +- feat: add register_avro and read_table [#461](https://github.com/apache/datafusion-python/pull/461) (mesejo) +- feat: add missing scalar math functions [#465](https://github.com/apache/datafusion-python/pull/465) (mesejo) +- build(deps): bump arduino/setup-protoc from 1 to 2 [#452](https://github.com/apache/datafusion-python/pull/452) (dependabot[bot]) +- Revert "build(deps): bump arduino/setup-protoc from 1 to 2 (#452)" [#474](https://github.com/apache/datafusion-python/pull/474) (viirya) +- Minor: fix wrongly copied function description [#497](https://github.com/apache/datafusion-python/pull/497) (viirya) +- Upgrade to Datafusion 31.0.0 [#491](https://github.com/apache/datafusion-python/pull/491) (judahrand) +- Add `isnan` and `iszero` [#495](https://github.com/apache/datafusion-python/pull/495) (judahrand) + +## 30.0.0 + +- Skipped due to a breaking change in DataFusion + +## 29.0.0 + +- Skipped + +## [28.0.0](https://github.com/apache/datafusion-python/tree/28.0.0) (2023-07-25) + +**Implemented 
enhancements:** + +- feat: expose offset in python API [#437](https://github.com/apache/datafusion-python/pull/437) (cpcloud) + +**Merged pull requests:** + +- File based input utils [#433](https://github.com/apache/datafusion-python/pull/433) (jdye64) +- Upgrade to 28.0.0-rc1 [#434](https://github.com/apache/datafusion-python/pull/434) (andygrove) +- Introduces utility for obtaining SqlTable information from a file like location [#398](https://github.com/apache/datafusion-python/pull/398) (jdye64) +- feat: expose offset in python API [#437](https://github.com/apache/datafusion-python/pull/437) (cpcloud) +- Use DataFusion 28 [#439](https://github.com/apache/datafusion-python/pull/439) (andygrove) + +## [27.0.0](https://github.com/apache/datafusion-python/tree/27.0.0) (2023-07-03) + +**Merged pull requests:** + +- LogicalPlan.to_variant() make public [#412](https://github.com/apache/datafusion-python/pull/412) (jdye64) +- Prepare 27.0.0 release [#423](https://github.com/apache/datafusion-python/pull/423) (andygrove) + +## [26.0.0](https://github.com/apache/datafusion-python/tree/26.0.0) (2023-06-11) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/25.0.0...26.0.0) + +**Merged pull requests:** + +- Add Expr::Case when_then_else support to rex_call_operands function [#388](https://github.com/apache/datafusion-python/pull/388) (jdye64) +- Introduce BaseSessionContext abstract class [#390](https://github.com/apache/datafusion-python/pull/390) (jdye64) +- CRUD Schema support for `BaseSessionContext` [#392](https://github.com/apache/datafusion-python/pull/392) (jdye64) +- CRUD Table support for `BaseSessionContext` [#394](https://github.com/apache/datafusion-python/pull/394) (jdye64) + +## [25.0.0](https://github.com/apache/datafusion-python/tree/25.0.0) (2023-05-23) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/24.0.0...25.0.0) + +**Merged pull requests:** + +- Prepare 24.0.0 Release 
[#376](https://github.com/apache/datafusion-python/pull/376) (andygrove) +- build(deps): bump uuid from 1.3.1 to 1.3.2 [#359](https://github.com/apache/datafusion-python/pull/359) (dependabot[bot]) +- build(deps): bump mimalloc from 0.1.36 to 0.1.37 [#361](https://github.com/apache/datafusion-python/pull/361) (dependabot[bot]) +- build(deps): bump regex-syntax from 0.6.29 to 0.7.1 [#334](https://github.com/apache/datafusion-python/pull/334) (dependabot[bot]) +- upgrade maturin to 0.15.1 [#379](https://github.com/apache/datafusion-python/pull/379) (Jimexist) +- Expand Expr to include RexType basic support [#378](https://github.com/apache/datafusion-python/pull/378) (jdye64) +- Add Python script for generating changelog [#383](https://github.com/apache/datafusion-python/pull/383) (andygrove) + +## [24.0.0](https://github.com/apache/datafusion-python/tree/24.0.0) (2023-05-09) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/23.0.0...24.0.0) + +**Documentation updates:** + +- Fix link to user guide [#354](https://github.com/apache/datafusion-python/pull/354) (andygrove) + +**Merged pull requests:** + +- Add interface to serialize Substrait plans to Python Bytes. [#344](https://github.com/apache/datafusion-python/pull/344) (kylebrooks-8451) +- Add partition_count property to ExecutionPlan. [#346](https://github.com/apache/datafusion-python/pull/346) (kylebrooks-8451) +- Remove unsendable from all Rust pyclass types. [#348](https://github.com/apache/datafusion-python/pull/348) (kylebrooks-8451) +- Fix link to user guide [#354](https://github.com/apache/datafusion-python/pull/354) (andygrove) +- Fix SessionContext execute. 
[#353](https://github.com/apache/datafusion-python/pull/353) (kylebrooks-8451) +- Pub mod expr in lib.rs [#357](https://github.com/apache/datafusion-python/pull/357) (jdye64) +- Add benchmark derived from TPC-H [#355](https://github.com/apache/datafusion-python/pull/355) (andygrove) +- Add db-benchmark [#365](https://github.com/apache/datafusion-python/pull/365) (andygrove) +- First pass of documentation in mdBook [#364](https://github.com/apache/datafusion-python/pull/364) (MrPowers) +- Add 'pub' and '#[pyo3(get, set)]' to DataTypeMap [#371](https://github.com/apache/datafusion-python/pull/371) (jdye64) +- Fix db-benchmark [#369](https://github.com/apache/datafusion-python/pull/369) (andygrove) +- Docs explaining how to view query plans [#373](https://github.com/apache/datafusion-python/pull/373) (andygrove) +- Improve db-benchmark [#372](https://github.com/apache/datafusion-python/pull/372) (andygrove) +- Make expr member of PyExpr public [#375](https://github.com/apache/datafusion-python/pull/375) (jdye64) + +## [23.0.0](https://github.com/apache/datafusion-python/tree/23.0.0) (2023-04-23) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/22.0.0...23.0.0) + +**Merged pull requests:** + +- Improve API docs, README, and examples for configuring context [#321](https://github.com/apache/datafusion-python/pull/321) (andygrove) +- Osx build linker args [#330](https://github.com/apache/datafusion-python/pull/330) (jdye64) +- Add requirements file for python 3.11 [#332](https://github.com/apache/datafusion-python/pull/332) (r4ntix) +- mac arm64 build [#338](https://github.com/apache/datafusion-python/pull/338) (andygrove) +- Add conda.yaml baseline workflow file [#281](https://github.com/apache/datafusion-python/pull/281) (jdye64) +- Prepare for 23.0.0 release [#335](https://github.com/apache/datafusion-python/pull/335) (andygrove) +- Reuse the Tokio Runtime [#341](https://github.com/apache/datafusion-python/pull/341) (kylebrooks-8451) + +## 
[22.0.0](https://github.com/apache/datafusion-python/tree/22.0.0) (2023-04-10) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/21.0.0...22.0.0) + +**Merged pull requests:** + +- Fix invalid build yaml [#308](https://github.com/apache/datafusion-python/pull/308) (andygrove) +- Try fix release build [#309](https://github.com/apache/datafusion-python/pull/309) (andygrove) +- Fix release build [#310](https://github.com/apache/datafusion-python/pull/310) (andygrove) +- Enable datafusion-substrait protoc feature, to remove compile-time dependency on protoc [#312](https://github.com/apache/datafusion-python/pull/312) (andygrove) +- Fix Mac/Win release builds in CI [#313](https://github.com/apache/datafusion-python/pull/313) (andygrove) +- install protoc in docs workflow [#314](https://github.com/apache/datafusion-python/pull/314) (andygrove) +- Fix documentation generation in CI [#315](https://github.com/apache/datafusion-python/pull/315) (andygrove) +- Source wheel fix [#319](https://github.com/apache/datafusion-python/pull/319) (andygrove) + +## [21.0.0](https://github.com/apache/datafusion-python/tree/21.0.0) (2023-03-30) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/20.0.0...21.0.0) + +**Merged pull requests:** + +- minor: Fix minor warning on unused import [#289](https://github.com/apache/datafusion-python/pull/289) (viirya) +- feature: Implement `describe()` method [#293](https://github.com/apache/datafusion-python/pull/293) (simicd) +- fix: Printed results not visible in debugger & notebooks [#296](https://github.com/apache/datafusion-python/pull/296) (simicd) +- add package.include and remove wildcard dependency [#295](https://github.com/apache/datafusion-python/pull/295) (andygrove) +- Update main branch name in docs workflow [#303](https://github.com/apache/datafusion-python/pull/303) (andygrove) +- Upgrade to DF 21 [#301](https://github.com/apache/datafusion-python/pull/301) (andygrove) + +## 
[20.0.0](https://github.com/apache/datafusion-python/tree/20.0.0) (2023-03-17) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/0.8.0...20.0.0) + +**Implemented enhancements:** + +- Empty relation bindings [#208](https://github.com/apache/datafusion-python/pull/208) (jdye64) +- wrap display_name and canonical_name functions [#214](https://github.com/apache/datafusion-python/pull/214) (jdye64) +- Add PyAlias bindings [#216](https://github.com/apache/datafusion-python/pull/216) (jdye64) +- Add bindings for scalar_variable [#218](https://github.com/apache/datafusion-python/pull/218) (jdye64) +- Bindings for LIKE type expressions [#220](https://github.com/apache/datafusion-python/pull/220) (jdye64) +- Bool expr bindings [#223](https://github.com/apache/datafusion-python/pull/223) (jdye64) +- Between bindings [#229](https://github.com/apache/datafusion-python/pull/229) (jdye64) +- Add bindings for GetIndexedField [#227](https://github.com/apache/datafusion-python/pull/227) (jdye64) +- Add bindings for case, cast, and trycast [#232](https://github.com/apache/datafusion-python/pull/232) (jdye64) +- add remaining expr bindings [#233](https://github.com/apache/datafusion-python/pull/233) (jdye64) +- feature: Additional export methods [#236](https://github.com/apache/datafusion-python/pull/236) (simicd) +- Add Python wrapper for LogicalPlan::Union [#240](https://github.com/apache/datafusion-python/pull/240) (iajoiner) +- feature: Create dataframe from pandas, polars, dictionary, list or pyarrow Table [#242](https://github.com/apache/datafusion-python/pull/242) (simicd) +- Add Python wrappers for `LogicalPlan::Join` and `LogicalPlan::CrossJoin` [#246](https://github.com/apache/datafusion-python/pull/246) (iajoiner) +- feature: Set table name from ctx functions [#260](https://github.com/apache/datafusion-python/pull/260) (simicd) +- Explain bindings [#264](https://github.com/apache/datafusion-python/pull/264) (jdye64) +- Extension bindings 
[#266](https://github.com/apache/datafusion-python/pull/266) (jdye64) +- Subquery alias bindings [#269](https://github.com/apache/datafusion-python/pull/269) (jdye64) +- Create memory table [#271](https://github.com/apache/datafusion-python/pull/271) (jdye64) +- Create view bindings [#273](https://github.com/apache/datafusion-python/pull/273) (jdye64) +- Re-export Datafusion dependencies [#277](https://github.com/apache/datafusion-python/pull/277) (jdye64) +- Distinct bindings [#275](https://github.com/apache/datafusion-python/pull/275) (jdye64) +- Drop table bindings [#283](https://github.com/apache/datafusion-python/pull/283) (jdye64) +- Bindings for LogicalPlan::Repartition [#285](https://github.com/apache/datafusion-python/pull/285) (jdye64) +- Expand Rust return type support for Arrow DataTypes in ScalarValue [#287](https://github.com/apache/datafusion-python/pull/287) (jdye64) + +**Documentation updates:** + +- docs: Example of calling Python UDF & UDAF in SQL [#258](https://github.com/apache/datafusion-python/pull/258) (simicd) + +**Merged pull requests:** + +- Minor docs updates [#210](https://github.com/apache/datafusion-python/pull/210) (andygrove) +- Empty relation bindings [#208](https://github.com/apache/datafusion-python/pull/208) (jdye64) +- wrap display_name and canonical_name functions [#214](https://github.com/apache/datafusion-python/pull/214) (jdye64) +- Add PyAlias bindings [#216](https://github.com/apache/datafusion-python/pull/216) (jdye64) +- Add bindings for scalar_variable [#218](https://github.com/apache/datafusion-python/pull/218) (jdye64) +- Bindings for LIKE type expressions [#220](https://github.com/apache/datafusion-python/pull/220) (jdye64) +- Bool expr bindings [#223](https://github.com/apache/datafusion-python/pull/223) (jdye64) +- Between bindings [#229](https://github.com/apache/datafusion-python/pull/229) (jdye64) +- Add bindings for GetIndexedField [#227](https://github.com/apache/datafusion-python/pull/227) (jdye64) +- Add 
bindings for case, cast, and trycast [#232](https://github.com/apache/datafusion-python/pull/232) (jdye64) +- add remaining expr bindings [#233](https://github.com/apache/datafusion-python/pull/233) (jdye64) +- Pre-commit hooks [#228](https://github.com/apache/datafusion-python/pull/228) (jdye64) +- Implement new release process [#149](https://github.com/apache/datafusion-python/pull/149) (andygrove) +- feature: Additional export methods [#236](https://github.com/apache/datafusion-python/pull/236) (simicd) +- Add Python wrapper for LogicalPlan::Union [#240](https://github.com/apache/datafusion-python/pull/240) (iajoiner) +- feature: Create dataframe from pandas, polars, dictionary, list or pyarrow Table [#242](https://github.com/apache/datafusion-python/pull/242) (simicd) +- Fix release instructions [#238](https://github.com/apache/datafusion-python/pull/238) (andygrove) +- Add Python wrappers for `LogicalPlan::Join` and `LogicalPlan::CrossJoin` [#246](https://github.com/apache/datafusion-python/pull/246) (iajoiner) +- docs: Example of calling Python UDF & UDAF in SQL [#258](https://github.com/apache/datafusion-python/pull/258) (simicd) +- feature: Set table name from ctx functions [#260](https://github.com/apache/datafusion-python/pull/260) (simicd) +- Upgrade to DataFusion 19 [#262](https://github.com/apache/datafusion-python/pull/262) (andygrove) +- Explain bindings [#264](https://github.com/apache/datafusion-python/pull/264) (jdye64) +- Extension bindings [#266](https://github.com/apache/datafusion-python/pull/266) (jdye64) +- Subquery alias bindings [#269](https://github.com/apache/datafusion-python/pull/269) (jdye64) +- Create memory table [#271](https://github.com/apache/datafusion-python/pull/271) (jdye64) +- Create view bindings [#273](https://github.com/apache/datafusion-python/pull/273) (jdye64) +- Re-export Datafusion dependencies [#277](https://github.com/apache/datafusion-python/pull/277) (jdye64) +- Distinct bindings 
[#275](https://github.com/apache/datafusion-python/pull/275) (jdye64) +- build(deps): bump actions/checkout from 2 to 3 [#244](https://github.com/apache/datafusion-python/pull/244) (dependabot[bot]) +- build(deps): bump actions/upload-artifact from 2 to 3 [#245](https://github.com/apache/datafusion-python/pull/245) (dependabot[bot]) +- build(deps): bump actions/download-artifact from 2 to 3 [#243](https://github.com/apache/datafusion-python/pull/243) (dependabot[bot]) +- Use DataFusion 20 [#278](https://github.com/apache/datafusion-python/pull/278) (andygrove) +- Drop table bindings [#283](https://github.com/apache/datafusion-python/pull/283) (jdye64) +- Bindings for LogicalPlan::Repartition [#285](https://github.com/apache/datafusion-python/pull/285) (jdye64) +- Expand Rust return type support for Arrow DataTypes in ScalarValue [#287](https://github.com/apache/datafusion-python/pull/287) (jdye64) + +## [0.8.0](https://github.com/apache/datafusion-python/tree/0.8.0) (2023-02-22) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/0.8.0-rc1...0.8.0) + +**Implemented enhancements:** + +- Add support for cuDF physical execution engine [\#202](https://github.com/apache/datafusion-python/issues/202) +- Make it easier to create a Pandas dataframe from DataFusion query results [\#139](https://github.com/apache/datafusion-python/issues/139) + +**Fixed bugs:** + +- Build error: could not compile `thiserror` due to 2 previous errors [\#69](https://github.com/apache/datafusion-python/issues/69) + +**Closed issues:** + +- Integrate with the new `object_store` crate [\#22](https://github.com/apache/datafusion-python/issues/22) + +**Merged pull requests:** + +- Update README in preparation for 0.8 release [\#206](https://github.com/apache/datafusion-python/pull/206) ([andygrove](https://github.com/andygrove)) +- Add support for cudf as a physical execution engine [\#205](https://github.com/apache/datafusion-python/pull/205) 
([jdye64](https://github.com/jdye64)) +- Run `maturin develop` instead of `cargo build` in verification script [\#200](https://github.com/apache/datafusion-python/pull/200) ([andygrove](https://github.com/andygrove)) +- Add tests for recently added functionality [\#199](https://github.com/apache/datafusion-python/pull/199) ([andygrove](https://github.com/andygrove)) +- Implement `to_pandas()` [\#197](https://github.com/apache/datafusion-python/pull/197) ([simicd](https://github.com/simicd)) +- Add Python wrapper for LogicalPlan::Sort [\#196](https://github.com/apache/datafusion-python/pull/196) ([andygrove](https://github.com/andygrove)) +- Add Python wrapper for LogicalPlan::Aggregate [\#195](https://github.com/apache/datafusion-python/pull/195) ([andygrove](https://github.com/andygrove)) +- Add Python wrapper for LogicalPlan::Limit [\#193](https://github.com/apache/datafusion-python/pull/193) ([andygrove](https://github.com/andygrove)) +- Add Python wrapper for LogicalPlan::Filter [\#192](https://github.com/apache/datafusion-python/pull/192) ([andygrove](https://github.com/andygrove)) +- Add experimental support for executing SQL with Polars and Pandas [\#190](https://github.com/apache/datafusion-python/pull/190) ([andygrove](https://github.com/andygrove)) +- Update changelog for 0.8 release [\#188](https://github.com/apache/datafusion-python/pull/188) ([andygrove](https://github.com/andygrove)) +- Add ability to execute ExecutionPlan and get a stream of RecordBatch [\#186](https://github.com/apache/datafusion-python/pull/186) ([andygrove](https://github.com/andygrove)) +- Dffield bindings [\#185](https://github.com/apache/datafusion-python/pull/185) ([jdye64](https://github.com/jdye64)) +- Add bindings for DFSchema [\#183](https://github.com/apache/datafusion-python/pull/183) ([jdye64](https://github.com/jdye64)) +- test: Window functions [\#182](https://github.com/apache/datafusion-python/pull/182) ([simicd](https://github.com/simicd)) +- Add bindings for 
Projection [\#180](https://github.com/apache/datafusion-python/pull/180) ([jdye64](https://github.com/jdye64)) +- Table scan bindings [\#178](https://github.com/apache/datafusion-python/pull/178) ([jdye64](https://github.com/jdye64)) +- Make session configurable [\#176](https://github.com/apache/datafusion-python/pull/176) ([andygrove](https://github.com/andygrove)) +- Upgrade to DataFusion 18.0.0 [\#175](https://github.com/apache/datafusion-python/pull/175) ([andygrove](https://github.com/andygrove)) +- Use latest DataFusion rev in preparation for DF 18 release [\#174](https://github.com/apache/datafusion-python/pull/174) ([andygrove](https://github.com/andygrove)) +- Arrow type bindings [\#173](https://github.com/apache/datafusion-python/pull/173) ([jdye64](https://github.com/jdye64)) +- Pyo3 bump [\#171](https://github.com/apache/datafusion-python/pull/171) ([jdye64](https://github.com/jdye64)) +- feature: Add additional aggregation functions [\#170](https://github.com/apache/datafusion-python/pull/170) ([simicd](https://github.com/simicd)) +- Make from_substrait_plan return DataFrame instead of LogicalPlan [\#164](https://github.com/apache/datafusion-python/pull/164) ([andygrove](https://github.com/andygrove)) +- feature: Implement count method [\#163](https://github.com/apache/datafusion-python/pull/163) ([simicd](https://github.com/simicd)) +- CI Fixes [\#162](https://github.com/apache/datafusion-python/pull/162) ([jdye64](https://github.com/jdye64)) +- Upgrade to DataFusion 17 [\#160](https://github.com/apache/datafusion-python/pull/160) ([andygrove](https://github.com/andygrove)) +- feature: Improve string representation of datafusion classes [\#159](https://github.com/apache/datafusion-python/pull/159) ([simicd](https://github.com/simicd)) +- Make PyExecutionPlan.plan public [\#156](https://github.com/apache/datafusion-python/pull/156) ([andygrove](https://github.com/andygrove)) +- Expose methods on logical and execution plans 
[\#155](https://github.com/apache/datafusion-python/pull/155) ([andygrove](https://github.com/andygrove)) +- Fix clippy for new Rust version [\#154](https://github.com/apache/datafusion-python/pull/154) ([andygrove](https://github.com/andygrove)) +- Add DataFrame methods for accessing plans [\#153](https://github.com/apache/datafusion-python/pull/153) ([andygrove](https://github.com/andygrove)) +- Use DataFusion rev 5238e8c97f998b4d2cb9fab85fb182f325a1a7fb [\#150](https://github.com/apache/datafusion-python/pull/150) ([andygrove](https://github.com/andygrove)) +- build\(deps\): bump async-trait from 0.1.61 to 0.1.62 [\#148](https://github.com/apache/datafusion-python/pull/148) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Rename default branch from master to main [\#147](https://github.com/apache/datafusion-python/pull/147) ([andygrove](https://github.com/andygrove)) +- Substrait bindings [\#145](https://github.com/apache/datafusion-python/pull/145) ([jdye64](https://github.com/jdye64)) +- build\(deps\): bump uuid from 0.8.2 to 1.2.2 [\#143](https://github.com/apache/datafusion-python/pull/143) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Prepare for 0.8.0 release [\#141](https://github.com/apache/datafusion-python/pull/141) ([andygrove](https://github.com/andygrove)) +- Improve README and add more examples [\#137](https://github.com/apache/datafusion-python/pull/137) ([andygrove](https://github.com/andygrove)) +- test: Expand tests for built-in functions [\#129](https://github.com/apache/datafusion-python/pull/129) ([simicd](https://github.com/simicd)) +- build\(deps\): bump object_store from 0.5.2 to 0.5.3 [\#126](https://github.com/apache/datafusion-python/pull/126) ([dependabot[bot]](https://github.com/apps/dependabot)) +- build\(deps\): bump mimalloc from 0.1.32 to 0.1.34 [\#125](https://github.com/apache/datafusion-python/pull/125) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Introduce conda directory containing 
datafusion-dev.yaml conda enviro… [\#124](https://github.com/apache/datafusion-python/pull/124) ([jdye64](https://github.com/jdye64)) +- build\(deps\): bump bzip2 from 0.4.3 to 0.4.4 [\#121](https://github.com/apache/datafusion-python/pull/121) ([dependabot[bot]](https://github.com/apps/dependabot)) +- build\(deps\): bump tokio from 1.23.0 to 1.24.1 [\#119](https://github.com/apache/datafusion-python/pull/119) ([dependabot[bot]](https://github.com/apps/dependabot)) +- build\(deps\): bump async-trait from 0.1.60 to 0.1.61 [\#118](https://github.com/apache/datafusion-python/pull/118) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Upgrade to DataFusion 16.0.0 [\#115](https://github.com/apache/datafusion-python/pull/115) ([andygrove](https://github.com/andygrove)) +- Bump async-trait from 0.1.57 to 0.1.60 [\#114](https://github.com/apache/datafusion-python/pull/114) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Bump object_store from 0.5.1 to 0.5.2 [\#112](https://github.com/apache/datafusion-python/pull/112) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Bump tokio from 1.21.2 to 1.23.0 [\#109](https://github.com/apache/datafusion-python/pull/109) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add entries for publishing production \(asf-site\) and staging docs [\#107](https://github.com/apache/datafusion-python/pull/107) ([martin-g](https://github.com/martin-g)) +- Add a workflow that builds the docs and deploys them at staged or production [\#104](https://github.com/apache/datafusion-python/pull/104) ([martin-g](https://github.com/martin-g)) +- Upgrade to DataFusion 15.0.0 [\#103](https://github.com/apache/datafusion-python/pull/103) ([andygrove](https://github.com/andygrove)) +- build\(deps\): bump futures from 0.3.24 to 0.3.25 [\#102](https://github.com/apache/datafusion-python/pull/102) ([dependabot[bot]](https://github.com/apps/dependabot)) +- build\(deps\): bump pyo3 from 0.17.2 to 0.17.3 
[\#101](https://github.com/apache/datafusion-python/pull/101) ([dependabot[bot]](https://github.com/apps/dependabot)) +- build\(deps\): bump mimalloc from 0.1.30 to 0.1.32 [\#98](https://github.com/apache/datafusion-python/pull/98) ([dependabot[bot]](https://github.com/apps/dependabot)) +- build\(deps\): bump rand from 0.7.3 to 0.8.5 [\#97](https://github.com/apache/datafusion-python/pull/97) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix GitHub actions warnings [\#95](https://github.com/apache/datafusion-python/pull/95) ([martin-g](https://github.com/martin-g)) +- Fixes \#81 - Add CI workflow for source distribution [\#93](https://github.com/apache/datafusion-python/pull/93) ([martin-g](https://github.com/martin-g)) +- post-release updates [\#91](https://github.com/apache/datafusion-python/pull/91) ([andygrove](https://github.com/andygrove)) +- Build for manylinux 2014 [\#88](https://github.com/apache/datafusion-python/pull/88) ([martin-g](https://github.com/martin-g)) +- update release readme tag [\#86](https://github.com/apache/datafusion-python/pull/86) ([Jimexist](https://github.com/Jimexist)) +- Upgrade Maturin to 0.14.2 [\#85](https://github.com/apache/datafusion-python/pull/85) ([martin-g](https://github.com/martin-g)) +- Update release instructions [\#83](https://github.com/apache/datafusion-python/pull/83) ([andygrove](https://github.com/andygrove)) +- \[Functions\] - Add python function binding to `functions` [\#73](https://github.com/apache/datafusion-python/pull/73) ([francis-du](https://github.com/francis-du)) + +## [0.8.0-rc1](https://github.com/apache/datafusion-python/tree/0.8.0-rc1) (2023-02-17) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/0.7.0-rc2...0.8.0-rc1) + +**Implemented enhancements:** + +- Add bindings for datafusion_common::DFField [\#184](https://github.com/apache/datafusion-python/issues/184) +- Add bindings for DFSchema/DFSchemaRef 
[\#181](https://github.com/apache/datafusion-python/issues/181) +- Add bindings for datafusion_expr Projection [\#179](https://github.com/apache/datafusion-python/issues/179) +- Add bindings for `TableScan` struct from `datafusion_expr::TableScan` [\#177](https://github.com/apache/datafusion-python/issues/177) +- Add a "mapping" struct for types [\#172](https://github.com/apache/datafusion-python/issues/172) +- Improve string representation of datafusion classes \(dataframe, context, expression, ...\) [\#158](https://github.com/apache/datafusion-python/issues/158) +- Add DataFrame count method [\#151](https://github.com/apache/datafusion-python/issues/151) +- \[REQUEST\] Github Actions Improvements [\#146](https://github.com/apache/datafusion-python/issues/146) +- Change default branch name from master to main [\#144](https://github.com/apache/datafusion-python/issues/144) +- Bump pyo3 to 0.18.0 [\#140](https://github.com/apache/datafusion-python/issues/140) +- Add script for Python linting [\#134](https://github.com/apache/datafusion-python/issues/134) +- Add Python bindings for substrait module [\#132](https://github.com/apache/datafusion-python/issues/132) +- Expand unit tests for built-in functions [\#128](https://github.com/apache/datafusion-python/issues/128) +- support creating arrow-datafusion-python conda environment [\#122](https://github.com/apache/datafusion-python/issues/122) +- Build Python source distribution in GitHub workflow [\#81](https://github.com/apache/datafusion-python/issues/81) +- EPIC: Add all functions to python binding `functions` [\#72](https://github.com/apache/datafusion-python/issues/72) + +**Fixed bugs:** + +- Build is broken [\#161](https://github.com/apache/datafusion-python/issues/161) +- Out of memory when sorting [\#157](https://github.com/apache/datafusion-python/issues/157) +- window_lead test appears to be non-deterministic [\#135](https://github.com/apache/datafusion-python/issues/135) +- Reading csv does not work 
[\#130](https://github.com/apache/datafusion-python/issues/130) +- Github actions produce a lot of warnings [\#94](https://github.com/apache/datafusion-python/issues/94) +- ASF source release tarball has wrong directory name [\#90](https://github.com/apache/datafusion-python/issues/90) +- Python Release Build failing after upgrading to maturin 14.2 [\#87](https://github.com/apache/datafusion-python/issues/87) +- Maturin build hangs on Linux ARM64 [\#84](https://github.com/apache/datafusion-python/issues/84) +- Cannot install on Mac M1 from source tarball from testpypi [\#82](https://github.com/apache/datafusion-python/issues/82) +- ImportPathMismatchError when running pytest locally [\#77](https://github.com/apache/datafusion-python/issues/77) + +**Closed issues:** + +- Publish documentation for Python bindings [\#39](https://github.com/apache/datafusion-python/issues/39) +- Add Python binding for `approx_median` [\#32](https://github.com/apache/datafusion-python/issues/32) +- Release version 0.7.0 [\#7](https://github.com/apache/datafusion-python/issues/7) + +## [0.7.0-rc2](https://github.com/apache/datafusion-python/tree/0.7.0-rc2) (2022-11-26) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/0.7.0...0.7.0-rc2) + +## [Unreleased](https://github.com/datafusion-contrib/datafusion-python/tree/HEAD) + +[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.1...HEAD) + +**Merged pull requests:** + +- use \_\_getitem\_\_ for df column selection [\#41](https://github.com/datafusion-contrib/datafusion-python/pull/41) ([Jimexist](https://github.com/Jimexist)) +- fix demo in readme [\#40](https://github.com/datafusion-contrib/datafusion-python/pull/40) ([Jimexist](https://github.com/Jimexist)) +- Implement select_columns [\#39](https://github.com/datafusion-contrib/datafusion-python/pull/39) ([andygrove](https://github.com/andygrove)) +- update readme and changelog 
[\#38](https://github.com/datafusion-contrib/datafusion-python/pull/38) ([Jimexist](https://github.com/Jimexist)) +- Add PyDataFrame.explain [\#36](https://github.com/datafusion-contrib/datafusion-python/pull/36) ([andygrove](https://github.com/andygrove)) +- Release 0.5.0 [\#34](https://github.com/datafusion-contrib/datafusion-python/pull/34) ([Jimexist](https://github.com/Jimexist)) +- disable nightly in workflow [\#33](https://github.com/datafusion-contrib/datafusion-python/pull/33) ([Jimexist](https://github.com/Jimexist)) +- update requirements to 37 and 310, update readme [\#32](https://github.com/datafusion-contrib/datafusion-python/pull/32) ([Jimexist](https://github.com/Jimexist)) +- Add custom global allocator [\#30](https://github.com/datafusion-contrib/datafusion-python/pull/30) ([matthewmturner](https://github.com/matthewmturner)) +- Remove pandas dependency [\#25](https://github.com/datafusion-contrib/datafusion-python/pull/25) ([matthewmturner](https://github.com/matthewmturner)) +- upgrade datafusion and pyo3 [\#20](https://github.com/datafusion-contrib/datafusion-python/pull/20) ([Jimexist](https://github.com/Jimexist)) +- update maturin 0.12+ [\#17](https://github.com/datafusion-contrib/datafusion-python/pull/17) ([Jimexist](https://github.com/Jimexist)) +- Update README.md [\#16](https://github.com/datafusion-contrib/datafusion-python/pull/16) ([Jimexist](https://github.com/Jimexist)) +- apply cargo clippy --fix [\#15](https://github.com/datafusion-contrib/datafusion-python/pull/15) ([Jimexist](https://github.com/Jimexist)) +- update test workflow to include rust clippy and check [\#14](https://github.com/datafusion-contrib/datafusion-python/pull/14) ([Jimexist](https://github.com/Jimexist)) +- use maturin 0.12.6 [\#13](https://github.com/datafusion-contrib/datafusion-python/pull/13) ([Jimexist](https://github.com/Jimexist)) +- apply cargo fmt [\#12](https://github.com/datafusion-contrib/datafusion-python/pull/12) 
([Jimexist](https://github.com/Jimexist)) +- use stable not nightly [\#11](https://github.com/datafusion-contrib/datafusion-python/pull/11) ([Jimexist](https://github.com/Jimexist)) +- ci: test against more compilers, setup clippy and fix clippy lints [\#9](https://github.com/datafusion-contrib/datafusion-python/pull/9) ([cpcloud](https://github.com/cpcloud)) +- Fix use of importlib.metadata and unify requirements.txt [\#8](https://github.com/datafusion-contrib/datafusion-python/pull/8) ([cpcloud](https://github.com/cpcloud)) +- Ship the Cargo.lock file in the source distribution [\#7](https://github.com/datafusion-contrib/datafusion-python/pull/7) ([cpcloud](https://github.com/cpcloud)) +- add \_\_version\_\_ attribute to datafusion object [\#3](https://github.com/datafusion-contrib/datafusion-python/pull/3) ([tfeda](https://github.com/tfeda)) +- fix ci by fixing directories [\#2](https://github.com/datafusion-contrib/datafusion-python/pull/2) ([Jimexist](https://github.com/Jimexist)) +- setup workflow [\#1](https://github.com/datafusion-contrib/datafusion-python/pull/1) ([Jimexist](https://github.com/Jimexist)) + +## [0.5.1](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.1) (2022-03-15) + +[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.1-rc1...0.5.1) + +## [0.5.1-rc1](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.1-rc1) (2022-03-15) + +[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0...0.5.1-rc1) + +## [0.5.0](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0) (2022-03-10) + +[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0-rc2...0.5.0) + +## [0.5.0-rc2](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0-rc2) (2022-03-10) + +[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0-rc1...0.5.0-rc2) + +**Closed issues:** + +- Add support for Ballista 
[\#37](https://github.com/datafusion-contrib/datafusion-python/issues/37) +- Implement DataFrame.explain [\#35](https://github.com/datafusion-contrib/datafusion-python/issues/35) + +## [0.5.0-rc1](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0-rc1) (2022-03-09) + +[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/4c98b8e9c3c3f8e2e6a8f2d1ffcfefda344c4680...0.5.0-rc1) + +**Closed issues:** + +- Investigate exposing additional optimizations [\#28](https://github.com/datafusion-contrib/datafusion-python/issues/28) +- Use custom allocator in Python build [\#27](https://github.com/datafusion-contrib/datafusion-python/issues/27) +- Why is pandas a requirement? [\#24](https://github.com/datafusion-contrib/datafusion-python/issues/24) +- Unable to build [\#18](https://github.com/datafusion-contrib/datafusion-python/issues/18) +- Setup CI against multiple Python version [\#6](https://github.com/datafusion-contrib/datafusion-python/issues/6) From 79c22d6d6c0809e7e93a0a23249baa516dbd8d6f Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 4 Dec 2024 05:56:11 -0500 Subject: [PATCH 081/248] Search default window functions if no session context was provided (#963) * Search default window functions if no session context was provided * Check if value is None because [] don't trigger the intended behavior --- python/datafusion/dataframe.py | 6 +++--- python/datafusion/functions.py | 1 + src/functions.rs | 11 +++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index e283f590e..0b38db924 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -446,14 +446,14 @@ def join( left_on = join_keys[0] right_on = join_keys[1] - if on: - if left_on or right_on: + if on is not None: + if left_on is not None or right_on is not None: raise ValueError( "`left_on` or `right_on` should not provided with `on`" ) left_on = on right_on = 
on - elif left_on or right_on: + elif left_on is not None or right_on is not None: if left_on is None or right_on is None: raise ValueError("`left_on` and `right_on` should both be provided.") else: diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 15ad8822f..f3ee5c092 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -431,6 +431,7 @@ def window( partition_by = expr_list_to_raw_expr_list(partition_by) order_by_raw = sort_list_to_raw_sort_list(order_by) window_frame = window_frame.window_frame if window_frame is not None else None + ctx = ctx.ctx if ctx is not None else None return Expr(f.window(name, args, partition_by, order_by_raw, window_frame, ctx)) diff --git a/src/functions.rs b/src/functions.rs index e29c57f9b..5c450286f 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -16,6 +16,7 @@ // under the License. use datafusion::functions_aggregate::all_default_aggregate_functions; +use datafusion::functions_window::all_default_window_functions; use datafusion::logical_expr::ExprFunctionExt; use datafusion::logical_expr::WindowFrame; use pyo3::{prelude::*, wrap_pyfunction}; @@ -282,6 +283,16 @@ fn find_window_fn(name: &str, ctx: Option) -> PyResult Date: Tue, 7 Jan 2025 21:28:25 +0800 Subject: [PATCH 082/248] Add arrow cast (#962) * feat: add data_type parameter to expr_fn macro for arrow_cast function * feat: add arrow_cast function to cast expressions to specified data types * docs: add casting section to user guide with examples for arrow_cast function * test: add unit test for arrow_cast function to validate casting to Float64 and Int32 * fix: update arrow_cast function to accept Expr type for data_type parameter * fix: update test_arrow_cast to use literal casting for data types * fix: update arrow_cast function to accept string type for data_type parameter * fix: update arrow_cast function to accept Expr type for data_type parameter * fix: update test_arrow_cast to use literal for data 
type parameters * fix: update arrow_cast function to use arg_1 for datatype parameter * fix: update arrow_cast function to accept string type for data_type parameter * Revert "fix: update arrow_cast function to accept string type for data_type parameter" This reverts commit eba0d320820e8f3f9688781f27b2a5579c0e9949. * fix: update test_arrow_cast to cast literals to string type for arrow_cast function * Revert "fix: update test_arrow_cast to cast literals to string type for arrow_cast function" This reverts commit 856ff8c4cad0075c282089b5368a7c3fd17f03d8. * fix: update arrow_cast function to accept string type for data_type parameter * Revert "fix: update arrow_cast function to accept string type for data_type parameter" This reverts commit 9e1ced7fb56c8aec47bc9f540ea5686c7246f022. * fix: add utf8_literal function to create UTF8 literal expressions in tests * Revert "fix: add utf8_literal function to create UTF8 literal expressions in tests" This reverts commit 11ed6749e02ab7b34d47fa105961f088f9fc9245. 
* feat: add utf8_literal function to create UTF8 literal expressions * fix: update test_arrow_cast to use column 'b' * fix: enhance utf8_literal function to handle non-string values * Add description for utf8_literal vs literal * docs: clarify utf8_literal function documentation to explain use case * docs: add clarification comments for utf8_literal usage in arrow_cast tests * docs: implement ruff recommendation * fix ruff errors * docs: update examples to use utf8_literal in arrow_cast function * docs: correct typo in comment for utf8_literal usage in test_arrow_cast * docs: remove redundant comment in test_arrow_cast for clarity * refactor: rename utf8_literal to string_literal and add alias str_lit * docs: improve docstring for string_literal function for clarity * docs: update import statement to include str_lit alias for string_literal --- .../user-guide/common-operations/functions.rst | 13 ++++++++++++- python/datafusion/__init__.py | 13 +++++++++++++ python/datafusion/expr.py | 16 ++++++++++++++++ python/datafusion/functions.py | 6 ++++++ python/tests/test_functions.py | 18 +++++++++++++++++- src/functions.rs | 3 ++- 6 files changed, 66 insertions(+), 3 deletions(-) diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst index ad71c72ac..12097be8f 100644 --- a/docs/source/user-guide/common-operations/functions.rst +++ b/docs/source/user-guide/common-operations/functions.rst @@ -38,7 +38,7 @@ DataFusion offers mathematical functions such as :py:func:`~datafusion.functions .. 
ipython:: python - from datafusion import col, literal + from datafusion import col, literal, string_literal, str_lit from datafusion import functions as f df.select( @@ -104,6 +104,17 @@ This also includes the functions for regular expressions like :py:func:`~datafus f.regexp_replace(col('"Name"'), literal("saur"), literal("fleur")).alias("flowers") ) +Casting +------- + +Casting expressions to different data types using :py:func:`~datafusion.functions.arrow_cast` + +.. ipython:: python + + df.select( + f.arrow_cast(col('"Total"'), string_literal("Float64")).alias("total_as_float"), + f.arrow_cast(col('"Total"'), str_lit("Int32")).alias("total_as_int") + ) Other ----- diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index e0bc57f44..7367b0d3b 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -107,6 +107,19 @@ def literal(value): return Expr.literal(value) +def string_literal(value): + """Create a UTF8 literal expression. + + It differs from `literal` which creates a UTF8view literal. + """ + return Expr.string_literal(value) + + +def str_lit(value): + """Alias for `string_literal`.""" + return string_literal(value) + + def lit(value): """Create a literal expression.""" return Expr.literal(value) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index b10724381..16add16f4 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -380,6 +380,22 @@ def literal(value: Any) -> Expr: value = pa.scalar(value) return Expr(expr_internal.Expr.literal(value)) + @staticmethod + def string_literal(value: str) -> Expr: + """Creates a new expression representing a UTF8 literal value. 
+ + It is different from `literal` because it is pa.string() instead of + pa.string_view() + + This is needed for cases where DataFusion is expecting a UTF8 instead of + UTF8View literal, like in: + https://github.com/apache/datafusion/blob/86740bfd3d9831d6b7c1d0e1bf4a21d91598a0ac/datafusion/functions/src/core/arrow_cast.rs#L179 + """ + if isinstance(value, str): + value = pa.scalar(value, type=pa.string()) + return Expr(expr_internal.Expr.literal(value)) + return Expr.literal(value) + @staticmethod def column(value: str) -> Expr: """Creates a new expression representing a column.""" diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index f3ee5c092..c0097c6ab 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -82,6 +82,7 @@ "array_to_string", "array_union", "arrow_typeof", + "arrow_cast", "ascii", "asin", "asinh", @@ -1109,6 +1110,11 @@ def arrow_typeof(arg: Expr) -> Expr: return Expr(f.arrow_typeof(arg.expr)) +def arrow_cast(expr: Expr, data_type: Expr) -> Expr: + """Casts an expression to a specified data type.""" + return Expr(f.arrow_cast(expr.expr, data_type.expr)) + + def random() -> Expr: """Returns a random value in the range ``0.0 <= x < 1.0``.""" return Expr(f.random()) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 0d2fa8f94..5dce188ed 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -23,7 +23,7 @@ from datafusion import SessionContext, column from datafusion import functions as f -from datafusion import literal +from datafusion import literal, string_literal np.seterr(invalid="ignore") @@ -907,6 +907,22 @@ def test_temporal_functions(df): assert result.column(10) == pa.array([31, 26, 2], type=pa.float64()) +def test_arrow_cast(df): + df = df.select( + # we use `string_literal` to return utf8 instead of `literal` which returns + # utf8view because datafusion.arrow_cast expects a utf8 instead of utf8view + # 
https://github.com/apache/datafusion/blob/86740bfd3d9831d6b7c1d0e1bf4a21d91598a0ac/datafusion/functions/src/core/arrow_cast.rs#L179 + f.arrow_cast(column("b"), string_literal("Float64")).alias("b_as_float"), + f.arrow_cast(column("b"), string_literal("Int32")).alias("b_as_int"), + ) + result = df.collect() + assert len(result) == 1 + result = result[0] + + assert result.column(0) == pa.array([4.0, 5.0, 6.0], type=pa.float64()) + assert result.column(1) == pa.array([4, 5, 6], type=pa.int32()) + + def test_case(df): df = df.select( f.case(column("b")).when(literal(4), literal(10)).otherwise(literal(8)), diff --git a/src/functions.rs b/src/functions.rs index 5c450286f..ccc1981bd 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -400,7 +400,6 @@ macro_rules! expr_fn { } }; } - /// Generates a [pyo3] wrapper for [datafusion::functions::expr_fn] /// /// These functions take a single `Vec` argument using `pyo3(signature = (*args))`. @@ -575,6 +574,7 @@ expr_fn_vec!(r#struct); // Use raw identifier since struct is a keyword expr_fn_vec!(named_struct); expr_fn!(from_unixtime, unixtime); expr_fn!(arrow_typeof, arg_1); +expr_fn!(arrow_cast, arg_1 datatype); expr_fn!(random); // Array Functions @@ -867,6 +867,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(range))?; m.add_wrapped(wrap_pyfunction!(array_agg))?; m.add_wrapped(wrap_pyfunction!(arrow_typeof))?; + m.add_wrapped(wrap_pyfunction!(arrow_cast))?; m.add_wrapped(wrap_pyfunction!(ascii))?; m.add_wrapped(wrap_pyfunction!(asin))?; m.add_wrapped(wrap_pyfunction!(asinh))?; From 85fe35cf433c2168fee40dc48bdaa80126bf4a42 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Tue, 7 Jan 2025 05:30:13 -0800 Subject: [PATCH 083/248] Fix small issues in pyproject.toml (#976) * Fix small issues in pyproject.toml * Update classifiers --- pyproject.toml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 
d327c0ec1..98bda5aae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ readme = "README.md" license = { file = "LICENSE.txt" } requires-python = ">=3.7" keywords = ["datafusion", "dataframe", "rust", "query-engine"] -classifier = [ +classifiers = [ "Development Status :: 2 - Pre-Alpha", "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", @@ -39,10 +39,14 @@ classifier = [ "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Programming Language :: Python", "Programming Language :: Rust", ] dependencies = ["pyarrow>=11.0.0", "typing-extensions;python_version<'3.13'"] +dynamic = ["version"] [project.urls] homepage = "https://datafusion.apache.org/python" From 63b13da4bccd66cb474186ebc2c4a1f8ba82230f Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Tue, 7 Jan 2025 14:34:44 +0100 Subject: [PATCH 084/248] chore: set validation and typehint (#983) --- python/datafusion/context.py | 13 ++++++++++++- src/context.rs | 4 ++-- src/dataframe.rs | 21 +-------------------- src/utils.rs | 21 +++++++++++++++++++++ 4 files changed, 36 insertions(+), 23 deletions(-) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index a07b5d175..3fa133346 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -63,6 +63,15 @@ def __arrow_c_array__( # noqa: D105 ) -> tuple[object, object]: ... +class TableProviderExportable(Protocol): + """Type hint for object that has __datafusion_table_provider__ PyCapsule. + + https://datafusion.apache.org/python/user-guide/io/table_provider.html + """ + + def __datafusion_table_provider__(self) -> object: ... 
# noqa: D105 + + class SessionConfig: """Session configuration options.""" @@ -685,7 +694,9 @@ def deregister_table(self, name: str) -> None: """Remove a table from the session.""" self.ctx.deregister_table(name) - def register_table_provider(self, name: str, provider: Any) -> None: + def register_table_provider( + self, name: str, provider: TableProviderExportable + ) -> None: """Register a table provider. This table provider must have a method called ``__datafusion_table_provider__`` diff --git a/src/context.rs b/src/context.rs index 8675e97df..0512285a7 100644 --- a/src/context.rs +++ b/src/context.rs @@ -43,7 +43,7 @@ use crate::store::StorageContexts; use crate::udaf::PyAggregateUDF; use crate::udf::PyScalarUDF; use crate::udwf::PyWindowUDF; -use crate::utils::{get_tokio_runtime, wait_for_future}; +use crate::utils::{get_tokio_runtime, validate_pycapsule, wait_for_future}; use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::arrow::pyarrow::PyArrowType; use datafusion::arrow::record_batch::RecordBatch; @@ -576,7 +576,7 @@ impl PySessionContext { if provider.hasattr("__datafusion_table_provider__")? 
{ let capsule = provider.getattr("__datafusion_table_provider__")?.call0()?; let capsule = capsule.downcast::()?; - // validate_pycapsule(capsule, "arrow_array_stream")?; + validate_pycapsule(capsule, "datafusion_table_provider")?; let provider = unsafe { capsule.reference::() }; let provider: ForeignTableProvider = provider.into(); diff --git a/src/dataframe.rs b/src/dataframe.rs index e7d6ca6d6..fcb46a756 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -44,7 +44,7 @@ use crate::expr::sort_expr::to_sort_expressions; use crate::physical_plan::PyExecutionPlan; use crate::record_batch::PyRecordBatchStream; use crate::sql::logical::PyLogicalPlan; -use crate::utils::{get_tokio_runtime, wait_for_future}; +use crate::utils::{get_tokio_runtime, validate_pycapsule, wait_for_future}; use crate::{ errors::DataFusionError, expr::{sort_expr::PySortExpr, PyExpr}, @@ -724,22 +724,3 @@ fn record_batch_into_schema( RecordBatch::try_new(schema, data_arrays) } - -fn validate_pycapsule(capsule: &Bound, name: &str) -> PyResult<()> { - let capsule_name = capsule.name()?; - if capsule_name.is_none() { - return Err(PyValueError::new_err( - "Expected schema PyCapsule to have name set.", - )); - } - - let capsule_name = capsule_name.unwrap().to_str()?; - if capsule_name != name { - return Err(PyValueError::new_err(format!( - "Expected name '{}' in PyCapsule, instead got '{}'", - name, capsule_name - ))); - } - - Ok(()) -} diff --git a/src/utils.rs b/src/utils.rs index 7fb23cafe..795589752 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -18,7 +18,9 @@ use crate::errors::DataFusionError; use crate::TokioRuntime; use datafusion::logical_expr::Volatility; +use pyo3::exceptions::PyValueError; use pyo3::prelude::*; +use pyo3::types::PyCapsule; use std::future::Future; use std::sync::OnceLock; use tokio::runtime::Runtime; @@ -58,3 +60,22 @@ pub(crate) fn parse_volatility(value: &str) -> Result, name: &str) -> PyResult<()> { + let capsule_name = capsule.name()?; + if capsule_name.is_none() 
{ + return Err(PyValueError::new_err( + "Expected schema PyCapsule to have name set.", + )); + } + + let capsule_name = capsule_name.unwrap().to_str()?; + if capsule_name != name { + return Err(PyValueError::new_err(format!( + "Expected name '{}' in PyCapsule, instead got '{}'", + name, capsule_name + ))); + } + + Ok(()) +} From 389164aa90c8dbe689f2e8eac0677ef2b80aaad9 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Thu, 9 Jan 2025 05:39:31 +0800 Subject: [PATCH 085/248] feat: support enable_url_table config (#980) * feat: support enable_url_table config * change enable_url_table as method * Remove whitespace --------- Co-authored-by: Tim Saucer --- examples/create-context.py | 3 +++ python/datafusion/context.py | 11 +++++++++++ src/context.rs | 6 ++++++ 3 files changed, 20 insertions(+) diff --git a/examples/create-context.py b/examples/create-context.py index 3184d4085..11525d8b8 100644 --- a/examples/create-context.py +++ b/examples/create-context.py @@ -37,3 +37,6 @@ ) ctx = SessionContext(config, runtime) print(ctx) + +ctx = ctx.enable_url_table() +print(ctx) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 3fa133346..6d7f574c4 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -481,6 +481,17 @@ def __init__( self.ctx = SessionContextInternal(config, runtime) + def enable_url_table(self) -> "SessionContext": + """Control if local files can be queried as tables. + + Returns: + A new :py:class:`SessionContext` object with url table enabled. 
+ """ + klass = self.__class__ + obj = klass.__new__(klass) + obj.ctx = self.ctx.enable_url_table() + return obj + def register_object_store( self, schema: str, store: Any, host: str | None = None ) -> None: diff --git a/src/context.rs b/src/context.rs index 0512285a7..88c90e0fd 100644 --- a/src/context.rs +++ b/src/context.rs @@ -299,6 +299,12 @@ impl PySessionContext { }) } + pub fn enable_url_table(&self) -> PyResult { + Ok(PySessionContext { + ctx: self.ctx.clone().enable_url_table(), + }) + } + /// Register an object store with the given name #[pyo3(signature = (scheme, store, host=None))] pub fn register_object_store( From 4b262be15202f5efb9a963faf66452f7fb0bbad3 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Thu, 9 Jan 2025 03:54:38 -0800 Subject: [PATCH 086/248] Support async iteration of RecordBatchStream (#975) * Support async iteration of RecordBatchStream * use __anext__ * use await * fix failing test * Since we are raising an error instead of returning a None, we can update the type hint. 
--------- Co-authored-by: Tim Saucer --- Cargo.lock | 14 +++++++++ Cargo.toml | 3 +- python/datafusion/record_batch.py | 16 ++++++---- python/tests/test_dataframe.py | 4 +-- src/record_batch.rs | 51 +++++++++++++++++++++++++------ 5 files changed, 69 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d1f291be9..352771cdb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1303,6 +1303,7 @@ dependencies = [ "prost", "prost-types", "pyo3", + "pyo3-async-runtimes", "pyo3-build-config", "tokio", "url", @@ -2672,6 +2673,19 @@ dependencies = [ "unindent", ] +[[package]] +name = "pyo3-async-runtimes" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2529f0be73ffd2be0cc43c013a640796558aa12d7ca0aab5cc14f375b4733031" +dependencies = [ + "futures", + "once_cell", + "pin-project-lite", + "pyo3", + "tokio", +] + [[package]] name = "pyo3-build-config" version = "0.22.6" diff --git a/Cargo.toml b/Cargo.toml index 703fc5a26..d28844685 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,6 +36,7 @@ substrait = ["dep:datafusion-substrait"] [dependencies] tokio = { version = "1.41", features = ["macros", "rt", "rt-multi-thread", "sync"] } pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] } +pyo3-async-runtimes = { version = "0.22", features = ["tokio-runtime"]} arrow = { version = "53", features = ["pyarrow"] } datafusion = { version = "43.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } datafusion-substrait = { version = "43.0.0", optional = true } @@ -60,4 +61,4 @@ crate-type = ["cdylib", "rlib"] [profile.release] lto = true -codegen-units = 1 \ No newline at end of file +codegen-units = 1 diff --git a/python/datafusion/record_batch.py b/python/datafusion/record_batch.py index 44936f7d8..75e58998f 100644 --- a/python/datafusion/record_batch.py +++ b/python/datafusion/record_batch.py @@ -57,20 +57,24 @@ def __init__(self, record_batch_stream: df_internal.RecordBatchStream) -> None: 
"""This constructor is typically not called by the end user.""" self.rbs = record_batch_stream - def next(self) -> RecordBatch | None: + def next(self) -> RecordBatch: """See :py:func:`__next__` for the iterator function.""" - try: - next_batch = next(self) - except StopIteration: - return None + return next(self) - return next_batch + async def __anext__(self) -> RecordBatch: + """Async iterator function.""" + next_batch = await self.rbs.__anext__() + return RecordBatch(next_batch) def __next__(self) -> RecordBatch: """Iterator function.""" next_batch = next(self.rbs) return RecordBatch(next_batch) + def __aiter__(self) -> typing_extensions.Self: + """Async iterator function.""" + return self + def __iter__(self) -> typing_extensions.Self: """Iterator function.""" return self diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index b82f95e35..e3bd1b2a5 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -761,8 +761,8 @@ def test_execution_plan(aggregate_df): batch = stream.next() assert batch is not None # there should be no more batches - batch = stream.next() - assert batch is None + with pytest.raises(StopIteration): + stream.next() def test_repartition(df): diff --git a/src/record_batch.rs b/src/record_batch.rs index 427807f22..eacdb5867 100644 --- a/src/record_batch.rs +++ b/src/record_batch.rs @@ -15,13 +15,17 @@ // specific language governing permissions and limitations // under the License. 
+use std::sync::Arc; + use crate::utils::wait_for_future; use datafusion::arrow::pyarrow::ToPyArrow; use datafusion::arrow::record_batch::RecordBatch; use datafusion::physical_plan::SendableRecordBatchStream; use futures::StreamExt; +use pyo3::exceptions::{PyStopAsyncIteration, PyStopIteration}; use pyo3::prelude::*; use pyo3::{pyclass, pymethods, PyObject, PyResult, Python}; +use tokio::sync::Mutex; #[pyclass(name = "RecordBatch", module = "datafusion", subclass)] pub struct PyRecordBatch { @@ -43,31 +47,58 @@ impl From for PyRecordBatch { #[pyclass(name = "RecordBatchStream", module = "datafusion", subclass)] pub struct PyRecordBatchStream { - stream: SendableRecordBatchStream, + stream: Arc>, } impl PyRecordBatchStream { pub fn new(stream: SendableRecordBatchStream) -> Self { - Self { stream } + Self { + stream: Arc::new(Mutex::new(stream)), + } } } #[pymethods] impl PyRecordBatchStream { - fn next(&mut self, py: Python) -> PyResult> { - let result = self.stream.next(); - match wait_for_future(py, result) { - None => Ok(None), - Some(Ok(b)) => Ok(Some(b.into())), - Some(Err(e)) => Err(e.into()), - } + fn next(&mut self, py: Python) -> PyResult { + let stream = self.stream.clone(); + wait_for_future(py, next_stream(stream, true)) } - fn __next__(&mut self, py: Python) -> PyResult> { + fn __next__(&mut self, py: Python) -> PyResult { self.next(py) } + fn __anext__<'py>(&'py self, py: Python<'py>) -> PyResult> { + let stream = self.stream.clone(); + pyo3_async_runtimes::tokio::future_into_py(py, next_stream(stream, false)) + } + fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { slf } + + fn __aiter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { + slf + } +} + +async fn next_stream( + stream: Arc>, + sync: bool, +) -> PyResult { + let mut stream = stream.lock().await; + match stream.next().await { + Some(Ok(batch)) => Ok(batch.into()), + Some(Err(e)) => Err(e.into()), + None => { + // Depending on whether the iteration is sync or not, we raise either a + // 
StopIteration or a StopAsyncIteration + if sync { + Err(PyStopIteration::new_err("stream exhausted")) + } else { + Err(PyStopAsyncIteration::new_err("stream exhausted")) + } + } + } } From db1bc62999f559d515a6a8a7f2194ab6d20b3035 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 9 Jan 2025 12:34:23 -0500 Subject: [PATCH 087/248] Chore/upgrade datafusion 44 (#973) * Bump DataFusion version to 44 * Trait definition for plan properties now returns LexOrdering * find_df_window_func was removed upstream * Prepare and Execute variants were removed from LogicalPlan * Substrait functions now take SessionState instead of SessionContext * Remove unused import * RuntimeConfig is now deprecated * Switch from RuntimeConfig to RuntimeEnvBuilder * Update return types on unit tests * DF 44 changes the execution plan properties to have boundedness and emission type * Initcap now returns stringview * Bump datafusion version in example --- Cargo.lock | 783 +++++++++--------- Cargo.toml | 9 +- README.md | 2 +- benchmarks/db-benchmark/groupby-datafusion.py | 6 +- benchmarks/tpch/tpch.py | 2 +- docs/source/user-guide/configuration.rst | 8 +- examples/create-context.py | 4 +- examples/ffi-table-provider/Cargo.toml | 4 +- python/datafusion/__init__.py | 4 +- python/datafusion/context.py | 55 +- python/tests/test_context.py | 10 +- python/tests/test_functions.py | 18 +- src/context.rs | 64 +- src/dataset_exec.rs | 20 +- src/functions.rs | 11 +- src/lib.rs | 2 +- src/sql/logical.rs | 2 - src/substrait.rs | 6 +- src/udwf.rs | 6 - 19 files changed, 529 insertions(+), 487 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 352771cdb..105cc30c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -111,9 +111,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.20" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9" +checksum = 
"683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" [[package]] name = "android-tzdata" @@ -132,20 +132,20 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.93" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" +checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" [[package]] name = "apache-avro" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ceb7c683b2f8f40970b70e39ff8be514c95b96fcb9c4af87e1ed2cb2e10801a0" +checksum = "1aef82843a0ec9f8b19567445ad2421ceeb1d711514384bdd3d49fe37102ee13" dependencies = [ - "bzip2", + "bigdecimal", + "bzip2 0.4.4", "crc32fast", "digest", - "lazy_static", "libflate", "log", "num-bigint", @@ -153,15 +153,16 @@ dependencies = [ "rand", "regex-lite", "serde", + "serde_bytes", "serde_json", "snap", - "strum 0.25.0", - "strum_macros 0.25.3", + "strum", + "strum_macros", "thiserror 1.0.69", "typed-builder", "uuid", "xz2", - "zstd 0.12.4", + "zstd", ] [[package]] @@ -415,16 +416,15 @@ version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df895a515f70646414f4b45c0b79082783b80552b373a68283012928df56f522" dependencies = [ - "bzip2", + "bzip2 0.4.4", "flate2", "futures-core", - "futures-io", "memchr", "pin-project-lite", "tokio", "xz2", - "zstd 0.13.2", - "zstd-safe 7.2.1", + "zstd", + "zstd-safe", ] [[package]] @@ -444,18 +444,18 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] name = "async-trait" -version = "0.1.83" +version = "0.1.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" +checksum = 
"3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] @@ -506,6 +506,20 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bigdecimal" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f31f3af01c5c65a07985c804d3366560e6fa7883d640a122819b14ec327482c" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", + "serde", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -598,6 +612,16 @@ dependencies = [ "libc", ] +[[package]] +name = "bzip2" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bafdbf26611df8c14810e268ddceda071c297570a5fb360ceddf617fe417ef58" +dependencies = [ + "bzip2-sys", + "libc", +] + [[package]] name = "bzip2-sys" version = "0.1.11+1.0.8" @@ -611,9 +635,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.2" +version = "1.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f34d93e62b03caf570cccc334cbc6c2fceca82f39211051345108adcba3eebdc" +checksum = "a012a0df96dd6d06ba9a1b29d6402d1a5d77c6befd2566afdc26e10603dc93d7" dependencies = [ "jobserver", "libc", @@ -634,9 +658,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.38" +version = "0.4.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" dependencies = [ "android-tzdata", "iana-time-zone", @@ -681,8 +705,8 @@ version = "7.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"24f165e7b643266ea80cb858aed492ad9280e3e05ce24d4a99d7d7b889b6a4d9" dependencies = [ - "strum 0.26.3", - "strum_macros 0.26.4", + "strum", + "strum_macros", "unicode-width", ] @@ -708,9 +732,9 @@ dependencies = [ [[package]] name = "const_panic" -version = "0.2.10" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "013b6c2c3a14d678f38cd23994b02da3a1a1b6a5d1eedddfe63a5a5f11b13a81" +checksum = "2459fc9262a1aa204eb4b5764ad4f189caec88aea9634389c0a25f8be7f6265e" [[package]] name = "constant_time_eq" @@ -778,18 +802,18 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.13" +version = "0.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2" +checksum = "06ba6d68e24814cb8de6bb986db8222d3a027d15872cabc0d18817bc3c0e4471" dependencies = [ "crossbeam-utils", ] [[package]] name = "crossbeam-utils" -version = "0.8.20" +version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" @@ -850,11 +874,10 @@ dependencies = [ [[package]] name = "datafusion" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbba0799cf6913b456ed07a94f0f3b6e12c62a5d88b10809e2284a0f2b915c05" +checksum = "014fc8c384ecacedaabb3bc8359c2a6c6e9d8f7bea65be3434eccacfc37f52d9" dependencies = [ - "ahash", "apache-avro", "arrow", "arrow-array", @@ -863,7 +886,7 @@ dependencies = [ "async-compression", "async-trait", "bytes", - "bzip2", + "bzip2 0.5.0", "chrono", "dashmap", "datafusion-catalog", @@ -874,6 +897,7 @@ dependencies = [ "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-nested", + "datafusion-functions-table", "datafusion-functions-window", 
"datafusion-optimizer", "datafusion-physical-expr", @@ -884,19 +908,14 @@ dependencies = [ "flate2", "futures", "glob", - "half", - "hashbrown 0.14.5", - "indexmap", "itertools", "log", "num-traits", - "num_cpus", "object_store", "parking_lot", "parquet", - "paste", - "pin-project-lite", "rand", + "regex", "sqlparser", "tempfile", "tokio", @@ -904,14 +923,14 @@ dependencies = [ "url", "uuid", "xz2", - "zstd 0.13.2", + "zstd", ] [[package]] name = "datafusion-catalog" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7493c5c2d40eec435b13d92e5703554f4efc7059451fcb8d3a79580ff0e45560" +checksum = "ee60d33e210ef96070377ae667ece7caa0e959c8387496773d4a1a72f1a5012e" dependencies = [ "arrow-schema", "async-trait", @@ -924,9 +943,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24953049ebbd6f8964f91f60aa3514e121b5e81e068e33b60e77815ab369b25c" +checksum = "0b42b7d720fe21ed9cca2ebb635f3f13a12cfab786b41e0fba184fb2e620525b" dependencies = [ "ahash", "apache-avro", @@ -934,44 +953,48 @@ dependencies = [ "arrow-array", "arrow-buffer", "arrow-schema", - "chrono", "half", "hashbrown 0.14.5", "indexmap", - "instant", "libc", - "num_cpus", + "log", "object_store", "parquet", "paste", "pyo3", + "recursive", "sqlparser", "tokio", + "web-time", ] [[package]] name = "datafusion-common-runtime" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f06df4ef76872e11c924d3c814fd2a8dd09905ed2e2195f71c857d78abd19685" +checksum = "72fbf14d4079f7ce5306393084fe5057dddfdc2113577e0049310afa12e94281" dependencies = [ "log", "tokio", ] +[[package]] +name = "datafusion-doc" +version = "44.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c278dbd64860ed0bb5240fc1f4cb6aeea437153910aea69bcf7d5a8d6d0454f3" + 
[[package]] name = "datafusion-execution" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bbdcb628d690f3ce5fea7de81642b514486d58ff9779a51f180a69a4eadb361" +checksum = "e22cb02af47e756468b3cbfee7a83e3d4f2278d452deb4b033ba933c75169486" dependencies = [ "arrow", - "chrono", "dashmap", "datafusion-common", "datafusion-expr", "futures", - "hashbrown 0.14.5", "log", "object_store", "parking_lot", @@ -982,45 +1005,41 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8036495980e3131f706b7d33ab00b4492d73dc714e3cb74d11b50f9602a73246" +checksum = "62298eadb1d15b525df1315e61a71519ffc563d41d5c3b2a30fda2d70f77b93c" dependencies = [ - "ahash", "arrow", - "arrow-array", - "arrow-buffer", "chrono", "datafusion-common", + "datafusion-doc", "datafusion-expr-common", "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr-common", "indexmap", "paste", + "recursive", "serde_json", "sqlparser", - "strum 0.26.3", - "strum_macros 0.26.4", ] [[package]] name = "datafusion-expr-common" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4da0f3cb4669f9523b403d6b5a0ec85023e0ab3bf0183afd1517475b3e64fdd2" +checksum = "dda7f73c5fc349251cd3dcb05773c5bf55d2505a698ef9d38dfc712161ea2f55" dependencies = [ "arrow", "datafusion-common", "itertools", - "paste", ] [[package]] name = "datafusion-ffi" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e923c459b53a26d92a8806d1f6a37fdf48bde51507a39eaed6f42a60f2bfd160" +checksum = "114e944790756b84c2cc5971eae24f5430980149345601939ac222885d4db5f7" dependencies = [ "abi_stable", "arrow", @@ -1028,7 +1047,6 @@ dependencies = [ "async-trait", "datafusion", "datafusion-proto", - "doc-comment", 
"futures", "log", "prost", @@ -1036,9 +1054,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f52c4012648b34853e40a2c6bcaa8772f837831019b68aca384fb38436dba162" +checksum = "fd197f3b2975424d3a4898ea46651be855a46721a56727515dbd5c9e2fb597da" dependencies = [ "arrow", "arrow-buffer", @@ -1047,8 +1065,11 @@ dependencies = [ "blake3", "chrono", "datafusion-common", + "datafusion-doc", "datafusion-execution", "datafusion-expr", + "datafusion-expr-common", + "datafusion-macros", "hashbrown 0.14.5", "hex", "itertools", @@ -1063,44 +1084,44 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5b8bb624597ba28ed7446df4a9bd7c7a7bde7c578b6b527da3f47371d5f6741" +checksum = "aabbe48fba18f9981b134124381bee9e46f93518b8ad2f9721ee296cef5affb9" dependencies = [ "ahash", "arrow", "arrow-schema", "datafusion-common", + "datafusion-doc", "datafusion-execution", "datafusion-expr", "datafusion-functions-aggregate-common", + "datafusion-macros", "datafusion-physical-expr", "datafusion-physical-expr-common", "half", - "indexmap", "log", "paste", ] [[package]] name = "datafusion-functions-aggregate-common" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb06208fc470bc8cf1ce2d9a1159d42db591f2c7264a8c1776b53ad8f675143" +checksum = "d7a3fefed9c8c11268d446d924baca8cabf52fe32f73fdaa20854bac6473590c" dependencies = [ "ahash", "arrow", "datafusion-common", "datafusion-expr-common", "datafusion-physical-expr-common", - "rand", ] [[package]] name = "datafusion-functions-nested" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fca25bbb87323716d05e54114666e942172ccca23c5a507e9c7851db6e965317" +checksum 
= "6360f27464fab857bec698af39b2ae331dc07c8bf008fb4de387a19cdc6815a5" dependencies = [ "arrow", "arrow-array", @@ -1116,18 +1137,35 @@ dependencies = [ "itertools", "log", "paste", - "rand", +] + +[[package]] +name = "datafusion-functions-table" +version = "44.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c35c070eb705c12795dab399c3809f4dfbc290678c624d3989490ca9b8449c1" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", + "paste", ] [[package]] name = "datafusion-functions-window" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ae23356c634e54c59f7c51acb7a5b9f6240ffb2cf997049a1a24a8a88598dbe" +checksum = "52229bca26b590b140900752226c829f15fc1a99840e1ca3ce1a9534690b82a8" dependencies = [ "datafusion-common", + "datafusion-doc", "datafusion-expr", "datafusion-functions-window-common", + "datafusion-macros", "datafusion-physical-expr", "datafusion-physical-expr-common", "log", @@ -1136,48 +1174,54 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4b3d6ff7794acea026de36007077a06b18b89e4f9c3fea7f2215f9f7dd9059b" +checksum = "367befc303b64a668a10ae6988a064a9289e1999e71a7f8e526b6e14d6bdd9d6" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", ] +[[package]] +name = "datafusion-macros" +version = "44.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5de3c8f386ea991696553afe241a326ecbc3c98a12c562867e4be754d3a060c" +dependencies = [ + "quote", + "syn 2.0.95", +] + [[package]] name = "datafusion-optimizer" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"bec6241eb80c595fa0e1a8a6b69686b5cf3bd5fdacb8319582a0943b0bd788aa" +checksum = "53b520413906f755910422b016fb73884ae6e9e1b376de4f9584b6c0e031da75" dependencies = [ "arrow", - "async-trait", "chrono", "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "hashbrown 0.14.5", "indexmap", "itertools", "log", - "paste", + "recursive", + "regex", "regex-syntax", ] [[package]] name = "datafusion-physical-expr" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3370357b8fc75ec38577700644e5d1b0bc78f38babab99c0b8bd26bafb3e4335" +checksum = "acd6ddc378f6ad19af95ccd6790dec8f8e1264bc4c70e99ddc1830c1a1c78ccd" dependencies = [ "ahash", "arrow", "arrow-array", "arrow-buffer", - "arrow-ord", "arrow-schema", - "arrow-string", - "chrono", "datafusion-common", "datafusion-expr", "datafusion-expr-common", @@ -1194,39 +1238,40 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8b7734d94bf2fa6f6e570935b0ddddd8421179ce200065be97874e13d46a47b" +checksum = "06e6c05458eccd74b4c77ed6a1fe63d52434240711de7f6960034794dad1caf5" dependencies = [ "ahash", "arrow", "datafusion-common", "datafusion-expr-common", "hashbrown 0.14.5", - "rand", + "itertools", ] [[package]] name = "datafusion-physical-optimizer" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eee8c479522df21d7b395640dff88c5ed05361852dce6544d7c98e9dbcebffe" +checksum = "9dc3a82190f49c37d377f31317e07ab5d7588b837adadba8ac367baad5dc2351" dependencies = [ "arrow", - "arrow-schema", "datafusion-common", "datafusion-execution", "datafusion-expr-common", "datafusion-physical-expr", "datafusion-physical-plan", "itertools", + "log", + "recursive", ] [[package]] name = "datafusion-physical-plan" -version = "43.0.0" +version = "44.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "17e1fc2e2c239d14e8556f2622b19a726bf6bc6962cc00c71fc52626274bee24" +checksum = "6a6608bc9844b4ddb5ed4e687d173e6c88700b1d0482f43894617d18a1fe75da" dependencies = [ "ahash", "arrow", @@ -1240,7 +1285,6 @@ dependencies = [ "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", - "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -1250,18 +1294,16 @@ dependencies = [ "indexmap", "itertools", "log", - "once_cell", "parking_lot", "pin-project-lite", - "rand", "tokio", ] [[package]] name = "datafusion-proto" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f730f7fc5a20134d4e5ecdf7bbf392002ac58163d58423ea28a702dc077b06e1" +checksum = "8e23b0998195e495bfa7b37cdceb317129a6c40522219f6872d2e0c9ae9f4fcb" dependencies = [ "arrow", "chrono", @@ -1275,14 +1317,12 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12c225fe49e4f943e35446b263613ada7a9e9f8d647544e6b07037b9803567df" +checksum = "cfc59992a29eed2d2c1dd779deac99083b217774ebcf90ee121840607a4d866f" dependencies = [ "arrow", - "chrono", "datafusion-common", - "object_store", "prost", ] @@ -1294,7 +1334,6 @@ dependencies = [ "async-trait", "datafusion", "datafusion-ffi", - "datafusion-functions-window-common", "datafusion-proto", "datafusion-substrait", "futures", @@ -1312,30 +1351,32 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63e3a4ed41dbee20a5d947a59ca035c225d67dc9cbe869c10f66dcdf25e7ce51" +checksum = "6a884061c79b33d0c8e84a6f4f4be8bdc12c0f53f5af28ddf5d6d95ac0b15fdc" dependencies = [ "arrow", "arrow-array", "arrow-schema", 
+ "bigdecimal", "datafusion-common", "datafusion-expr", "indexmap", "log", + "recursive", "regex", "sqlparser", - "strum 0.26.3", ] [[package]] name = "datafusion-substrait" -version = "43.0.0" +version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b9c768d2b4c4485c43afbaeeb86dd1f2ac3fb34a9e6e8c8b06180d2a223d5ba" +checksum = "d2ec36dd38512b1ecc7a3bb92e72046b944611b2f0d709445c1e51b0143bffd4" dependencies = [ "arrow-buffer", "async-recursion", + "async-trait", "chrono", "datafusion", "itertools", @@ -1365,15 +1406,9 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] -[[package]] -name = "doc-comment" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" - [[package]] name = "dyn-clone" version = "1.0.17" @@ -1404,9 +1439,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "fixedbitset" @@ -1416,9 +1451,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flatbuffers" -version = "24.3.25" +version = "24.12.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8add37afff2d4ffa83bc748a70b4b1370984f6980768554182424ef71447c35f" +checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" dependencies = [ "bitflags 1.3.2", "rustc_version", @@ -1505,7 +1540,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] @@ -1578,9 +1613,9 @@ 
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "glob" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "h2" @@ -1628,24 +1663,12 @@ version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" -[[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" - [[package]] name = "hex" version = "0.4.3" @@ -1654,9 +1677,9 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "http" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea" dependencies = [ "bytes", "fnv", @@ -1700,9 +1723,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "1.5.1" +version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97818827ef4f364230e16705d4706e2897df2bb60617d6ca15d598025a3c481f" +checksum = 
"256fb8d4bd6413123cc9d91832d78325c48ff41677595be797d90f42969beae0" dependencies = [ "bytes", "futures-channel", @@ -1720,9 +1743,9 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.3" +version = "0.27.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" +checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2" dependencies = [ "futures-util", "http", @@ -1893,7 +1916,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] @@ -1919,9 +1942,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.6.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", "hashbrown 0.15.2", @@ -1933,18 +1956,6 @@ version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" -[[package]] -name = "instant" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" -dependencies = [ - "cfg-if", - "js-sys", - "wasm-bindgen", - "web-sys", -] - [[package]] name = "integer-encoding" version = "3.0.4" @@ -1983,9 +1994,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.74" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a865e038f7f6ed956f788f0d7d60c541fff74c7bd74272c5d4cf15c63743e705" +checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" dependencies = [ "once_cell", "wasm-bindgen", @@ -1999,9 +2010,9 @@ checksum = 
"bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lexical-core" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0431c65b318a590c1de6b8fd6e72798c92291d27762d94c9e6c37ed7a73d8458" +checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -2012,9 +2023,9 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb17a4bdb9b418051aa59d41d65b1c9be5affab314a872e5ad7f06231fb3b4e0" +checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" dependencies = [ "lexical-parse-integer", "lexical-util", @@ -2023,9 +2034,9 @@ dependencies = [ [[package]] name = "lexical-parse-integer" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5df98f4a4ab53bf8b175b363a34c7af608fe31f93cc1fb1bf07130622ca4ef61" +checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" dependencies = [ "lexical-util", "static_assertions", @@ -2033,18 +2044,18 @@ dependencies = [ [[package]] name = "lexical-util" -version = "1.0.3" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85314db53332e5c192b6bca611fb10c114a80d1b831ddac0af1e9be1b9232ca0" +checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" dependencies = [ "static_assertions", ] [[package]] name = "lexical-write-float" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e7c3ad4e37db81c1cbe7cf34610340adc09c322871972f74877a712abc6c809" +checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" dependencies = [ "lexical-util", "lexical-write-integer", @@ -2053,9 +2064,9 @@ dependencies = [ 
[[package]] name = "lexical-write-integer" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb89e9f6958b83258afa3deed90b5de9ef68eef090ad5086c791cd2345610162" +checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" dependencies = [ "lexical-util", "static_assertions", @@ -2063,9 +2074,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.167" +version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libflate" @@ -2119,9 +2130,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.14" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" [[package]] name = "litemap" @@ -2207,9 +2218,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.8.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394" dependencies = [ "adler2", ] @@ -2253,6 +2264,7 @@ checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ "num-integer", "num-traits", + "serde", ] [[package]] @@ -2305,36 +2317,27 @@ dependencies = [ "libm", ] -[[package]] -name = "num_cpus" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" -dependencies = [ - "hermit-abi", - "libc", -] - [[package]] name = 
"object" -version = "0.36.5" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" dependencies = [ "memchr", ] [[package]] name = "object_store" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6eb4c22c6154a1e759d7099f9ffad7cc5ef8245f9efbab4a41b92623079c82f3" +checksum = "3cfccb68961a56facde1163f9319e0d15743352344e7808a11795fb99698dcaf" dependencies = [ "async-trait", "base64 0.22.1", "bytes", "chrono", "futures", + "httparse", "humantime", "hyper", "itertools", @@ -2431,7 +2434,7 @@ dependencies = [ "thrift", "tokio", "twox-hash", - "zstd 0.13.2", + "zstd", "zstd-sys", ] @@ -2466,7 +2469,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6eea3058763d6e656105d1403cb04e0a41b7bbac6362d413e7c33be0c32279c9" dependencies = [ - "heck 0.5.0", + "heck", "itertools", "prost", "prost-types", @@ -2505,18 +2508,18 @@ dependencies = [ [[package]] name = "phf" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" dependencies = [ "phf_shared", ] [[package]] name = "phf_codegen" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" dependencies = [ "phf_generator", "phf_shared", @@ -2524,9 +2527,9 @@ dependencies = [ [[package]] name = "phf_generator" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" dependencies = [ "phf_shared", "rand", @@ -2534,18 +2537,18 @@ dependencies = [ [[package]] name = "phf_shared" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" dependencies = [ "siphasher", ] [[package]] name = "pin-project-lite" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" [[package]] name = "pin-utils" @@ -2576,12 +2579,12 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.25" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64d1ec885c64d0457d564db4ec299b2dae3f9c02808b8ad9c3a089c591b18033" +checksum = "483f8c21f64f3ea09fe0f30f5d48c3e8eefe5dac9129f0075f76593b4c1da705" dependencies = [ "proc-macro2", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] @@ -2595,9 +2598,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f" +checksum = "2c0fef6c4230e4ccf618a35c59d7ede15dea37de8427500f50aff708806e42ec" dependencies = [ "bytes", "prost-derive", @@ -2605,12 +2608,11 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" +checksum = 
"d0f3e5beed80eb580c68e2c600937ac2c4eedabdfd5ef1e5b7ea4f3fba84497b" dependencies = [ - "bytes", - "heck 0.5.0", + "heck", "itertools", "log", "multimap", @@ -2620,28 +2622,28 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.90", + "syn 2.0.95", "tempfile", ] [[package]] name = "prost-derive" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" +checksum = "157c5a9d7ea5c2ed2d9fb8f495b64759f7816c7eaea54ba3978f0d63000162e3" dependencies = [ "anyhow", "itertools", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] name = "prost-types" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670" +checksum = "cc2f1e56baa61e93533aebc21af4d2134b70f66275e0fcdf3cbe43d77ff7e8fc" dependencies = [ "prost", ] @@ -2655,6 +2657,15 @@ dependencies = [ "cmake", ] +[[package]] +name = "psm" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "200b9ff220857e53e184257720a14553b2f4aa02577d2ed9842d45d4b9654810" +dependencies = [ + "cc", +] + [[package]] name = "pyo3" version = "0.22.6" @@ -2715,7 +2726,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] @@ -2724,11 +2735,11 @@ version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] @@ -2739,9 +2750,9 @@ checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" [[package]] name = "quick-xml" -version = "0.36.2" +version = "0.37.2" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "f7649a7b4df05aed9ea7ec6f628c67c9953a43869b8bc50929569b2999d443fe" +checksum = "165859e9e55f79d67b96c5d96f4e88b6f2695a1972849c15a6a3f5c59fc2c003" dependencies = [ "memchr", "serde", @@ -2760,7 +2771,7 @@ dependencies = [ "rustc-hash", "rustls", "socket2", - "thiserror 2.0.3", + "thiserror 2.0.10", "tokio", "tracing", ] @@ -2779,7 +2790,7 @@ dependencies = [ "rustls", "rustls-pki-types", "slab", - "thiserror 2.0.3", + "thiserror 2.0.10", "tinyvec", "tracing", "web-time", @@ -2787,9 +2798,9 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.7" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d5a626c6807713b15cac82a6acaccd6043c9a5408c24baae07611fec3f243da" +checksum = "1c40286217b4ba3a71d644d752e6a0b71f13f1b6a2c5311acfcbe0c2418ed904" dependencies = [ "cfg_aliases", "libc", @@ -2801,9 +2812,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.37" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" dependencies = [ "proc-macro2", ] @@ -2838,11 +2849,31 @@ dependencies = [ "getrandom", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.95", +] + [[package]] name = "redox_syscall" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" +checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" dependencies = [ "bitflags 2.6.0", ] @@ -2903,9 +2934,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.9" +version = "0.12.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" +checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" dependencies = [ "base64 0.22.1", "bytes", @@ -2937,6 +2968,7 @@ dependencies = [ "tokio", "tokio-rustls", "tokio-util", + "tower", "tower-service", "url", "wasm-bindgen", @@ -2990,22 +3022,22 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.41" +version = "0.38.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" +checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6" dependencies = [ "bitflags 2.6.0", "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "rustls" -version = "0.23.19" +version = "0.23.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "934b404430bb06b3fae2cba809eb45a1ab1aecd64491213d7c3301b88393f8d1" +checksum = "5065c3f250cbd332cd894be57c40fa52387247659b14a2d6041d121547903b1b" dependencies = [ "once_cell", "ring", @@ -3038,9 +3070,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.10.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" +checksum = "d2bf47e6ff922db3825eb750c4e2ff784c6ff8fb9e13046ef6a1d1c5401b0b37" dependencies = [ "web-time", ] @@ -3058,9 +3090,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.18" +version = "1.0.19" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" +checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" [[package]] name = "ryu" @@ -3107,7 +3139,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] @@ -3118,9 +3150,9 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "security-framework" -version = "3.0.1" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1415a607e92bec364ea2cf9264646dcce0f91e6d65281bd6f2819cca3bf39c8" +checksum = "271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316" dependencies = [ "bitflags 2.6.0", "core-foundation", @@ -3131,9 +3163,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.12.1" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa39c7303dc58b5543c94d22c1766b0d31f2ee58306363ea622b10bbc075eaa2" +checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32" dependencies = [ "core-foundation-sys", "libc", @@ -3141,9 +3173,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.23" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" +checksum = "3cb6eb87a131f756572d7fb904f6e7b68633f09cca868c5df1c4b8d1a694bbba" dependencies = [ "serde", ] @@ -3156,22 +3188,31 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.215" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" dependencies = [ 
"serde_derive", ] +[[package]] +name = "serde_bytes" +version = "0.11.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "387cc504cb06bb40a96c8e04e951fe01854cf6bc921053c954e4a606d9675c6a" +dependencies = [ + "serde", +] + [[package]] name = "serde_derive" -version = "1.0.215" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] @@ -3182,14 +3223,14 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] name = "serde_json" -version = "1.0.133" +version = "1.0.135" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" +checksum = "2b0d7ba2887406110130a978386c4e1befb98c674b4fba677954e4db976630d9" dependencies = [ "itoa", "memchr", @@ -3206,7 +3247,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] @@ -3253,9 +3294,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "siphasher" -version = "0.3.11" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" [[package]] name = "slab" @@ -3287,10 +3328,10 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] @@ -3317,9 
+3358,9 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "sqlparser" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fe11944a61da0da3f592e19a45ebe5ab92dc14a779907ff1f08fbb797bfefc7" +checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8" dependencies = [ "log", "sqlparser_derive", @@ -3327,13 +3368,13 @@ dependencies = [ [[package]] name = "sqlparser_derive" -version = "0.2.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" +checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] @@ -3343,38 +3384,29 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] -name = "static_assertions" -version = "1.1.0" +name = "stacker" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +checksum = "799c883d55abdb5e98af1a7b3f23b9b6de8ecada0ecac058672d7635eb48ca7b" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] [[package]] -name = "strum" -version = "0.25.0" +name = "static_assertions" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "strum" version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" -dependencies = [ - "strum_macros 0.26.4", -] - -[[package]] -name 
= "strum_macros" -version = "0.25.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" -dependencies = [ - "heck 0.4.1", - "proc-macro2", - "quote", - "rustversion", - "syn 2.0.90", -] [[package]] name = "strum_macros" @@ -3382,20 +3414,20 @@ version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "rustversion", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] name = "substrait" -version = "0.45.5" +version = "0.50.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a127ae9d8e443cea5c2122eb2ffe5fe489e802a1e746a09c5a5cb59d074c0aeb" +checksum = "b1772d041c37cc7e6477733c76b2acf4ee36bd52b2ae4d9ea0ec9c87d003db32" dependencies = [ - "heck 0.5.0", + "heck", "pbjson", "pbjson-build", "pbjson-types", @@ -3410,7 +3442,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.90", + "syn 2.0.95", "typify", "walkdir", ] @@ -3434,9 +3466,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.90" +version = "2.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" +checksum = "46f71c0377baf4ef1cc3e3402ded576dccc315800fbc62dfc7fe04b009773b4a" dependencies = [ "proc-macro2", "quote", @@ -3460,7 +3492,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] @@ -3471,12 +3503,13 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.14.0" +version = "3.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" 
+checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704" dependencies = [ "cfg-if", "fastrand", + "getrandom", "once_cell", "rustix", "windows-sys 0.59.0", @@ -3493,11 +3526,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.3" +version = "2.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa" +checksum = "a3ac7f54ca534db81081ef1c1e7f6ea8a3ef428d2fc069097c079443d24124d3" dependencies = [ - "thiserror-impl 2.0.3", + "thiserror-impl 2.0.10", ] [[package]] @@ -3508,18 +3541,18 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] name = "thiserror-impl" -version = "2.0.3" +version = "2.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568" +checksum = "9e9465d30713b56a37ede7185763c3492a91be2f5fa68d958c44e41ab9248beb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] @@ -3554,9 +3587,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.8.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" +checksum = "022db8904dfa342efe721985167e9fcd16c29b226db4397ed752a761cfce81e8" dependencies = [ "tinyvec_macros", ] @@ -3569,9 +3602,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.41.1" +version = "1.43.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33" +checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" dependencies = [ "backtrace", "bytes", @@ -3585,31 +3618,30 @@ dependencies = [ 
[[package]] name = "tokio-macros" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] name = "tokio-rustls" -version = "0.26.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" +checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" dependencies = [ "rustls", - "rustls-pki-types", "tokio", ] [[package]] name = "tokio-util" -version = "0.7.12" +version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" +checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" dependencies = [ "bytes", "futures-core", @@ -3618,6 +3650,27 @@ dependencies = [ "tokio", ] +[[package]] +name = "tower" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + [[package]] name = "tower-service" version = "0.3.3" @@ -3643,7 +3696,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] @@ -3694,22 +3747,22 @@ checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" 
[[package]] name = "typed-builder" -version = "0.16.2" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34085c17941e36627a879208083e25d357243812c30e7d7387c3b954f30ade16" +checksum = "a06fbd5b8de54c5f7c91f6fe4cebb949be2125d7758e630bb58b1d831dbce600" dependencies = [ "typed-builder-macro", ] [[package]] name = "typed-builder-macro" -version = "0.16.2" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" +checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] @@ -3734,7 +3787,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d59ab345b6c0d8ae9500b9ff334a4c7c0d316c1c628dc55726b95887eb8dbd11" dependencies = [ - "heck 0.5.0", + "heck", "log", "proc-macro2", "quote", @@ -3743,7 +3796,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.90", + "syn 2.0.95", "thiserror 1.0.69", "unicode-ident", ] @@ -3761,7 +3814,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.90", + "syn 2.0.95", "typify-impl", ] @@ -3867,9 +3920,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d15e63b4482863c109d70a7b8706c1e364eb6ea449b201a76c5b89cedcec2d5c" +checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" dependencies = [ "cfg-if", "once_cell", @@ -3878,24 +3931,23 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d36ef12e3aaca16ddd3f67922bc63e48e953f126de60bd33ccc0101ef9998cd" +checksum = 
"5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" dependencies = [ "bumpalo", "log", - "once_cell", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.47" +version = "0.4.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dfaf8f50e5f293737ee323940c7d8b08a66a95a419223d9f41610ca08b0833d" +checksum = "38176d9b44ea84e9184eff0bc34cc167ed044f816accfe5922e54d84cf48eca2" dependencies = [ "cfg-if", "js-sys", @@ -3906,9 +3958,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "705440e08b42d3e4b36de7d66c944be628d579796b8090bfa3471478a2260051" +checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3916,22 +3968,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98c9ae5a76e46f4deecd0f0255cc223cfa18dc9b261213b8aa0c7b36f61b3f1d" +checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ee99da9c5ba11bd675621338ef6fa52296b76b83305e9b6e5c77d4c286d6d49" +checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" [[package]] name = "wasm-streams" @@ -3948,9 +4000,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.74" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a98bc3c33f0fe7e59ad7cd041b89034fa82a7c2d4365ca538dda6cdaf513863c" +checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc" dependencies = [ "js-sys", "wasm-bindgen", @@ -4159,7 +4211,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", "synstructure", ] @@ -4181,7 +4233,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", ] [[package]] @@ -4201,7 +4253,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.95", "synstructure", ] @@ -4230,16 +4282,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", -] - -[[package]] -name = "zstd" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" -dependencies = [ - "zstd-safe 6.0.6", + "syn 2.0.95", ] [[package]] @@ -4248,17 +4291,7 @@ version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" dependencies = [ - "zstd-safe 7.2.1", -] - -[[package]] -name = "zstd-safe" -version = "6.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" -dependencies = [ - "libc", - "zstd-sys", + "zstd-safe", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index d28844685..48219414a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,11 +38,10 @@ tokio = { version = "1.41", features = ["macros", "rt", "rt-multi-thread", "sync pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] } pyo3-async-runtimes = { version 
= "0.22", features = ["tokio-runtime"]} arrow = { version = "53", features = ["pyarrow"] } -datafusion = { version = "43.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } -datafusion-substrait = { version = "43.0.0", optional = true } -datafusion-proto = { version = "43.0.0" } -datafusion-ffi = { version = "43.0.0" } -datafusion-functions-window-common = { version = "43.0.0" } +datafusion = { version = "44.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } +datafusion-substrait = { version = "44.0.0", optional = true } +datafusion-proto = { version = "44.0.0" } +datafusion-ffi = { version = "44.0.0" } prost = "0.13" # keep in line with `datafusion-substrait` uuid = { version = "1.11", features = ["v4"] } mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] } diff --git a/README.md b/README.md index 83b307e7a..ca612c1ab 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,7 @@ It is possible to configure runtime (memory and disk settings) and configuration ```python runtime = ( - RuntimeConfig() + RuntimeEnvBuilder() .with_disk_manager_os() .with_fair_spill_pool(10000000) ) diff --git a/benchmarks/db-benchmark/groupby-datafusion.py b/benchmarks/db-benchmark/groupby-datafusion.py index 3a4399f7d..960c8ba9a 100644 --- a/benchmarks/db-benchmark/groupby-datafusion.py +++ b/benchmarks/db-benchmark/groupby-datafusion.py @@ -22,7 +22,7 @@ from datafusion import ( col, functions as f, - RuntimeConfig, + RuntimeEnvBuilder, SessionConfig, SessionContext, ) @@ -85,7 +85,9 @@ def execute(df): # create a session context with explicit runtime and config settings runtime = ( - RuntimeConfig().with_disk_manager_os().with_fair_spill_pool(64 * 1024 * 1024 * 1024) + RuntimeEnvBuilder() + .with_disk_manager_os() + .with_fair_spill_pool(64 * 1024 * 1024 * 1024) ) config = ( SessionConfig() diff --git a/benchmarks/tpch/tpch.py b/benchmarks/tpch/tpch.py index 7f104a4cb..daa831b55 100644 --- 
a/benchmarks/tpch/tpch.py +++ b/benchmarks/tpch/tpch.py @@ -28,7 +28,7 @@ def bench(data_path, query_path): # create context # runtime = ( - # RuntimeConfig() + # RuntimeEnvBuilder() # .with_disk_manager_os() # .with_fair_spill_pool(10000000) # ) diff --git a/docs/source/user-guide/configuration.rst b/docs/source/user-guide/configuration.rst index 7d330019f..db200a46a 100644 --- a/docs/source/user-guide/configuration.rst +++ b/docs/source/user-guide/configuration.rst @@ -19,18 +19,18 @@ Configuration ============= Let's look at how we can configure DataFusion. When creating a :py:class:`~datafusion.context.SessionContext`, you can pass in -a :py:class:`~datafusion.context.SessionConfig` and :py:class:`~datafusion.context.RuntimeConfig` object. These two cover a wide range of options. +a :py:class:`~datafusion.context.SessionConfig` and :py:class:`~datafusion.context.RuntimeEnvBuilder` object. These two cover a wide range of options. .. code-block:: python - from datafusion import RuntimeConfig, SessionConfig, SessionContext + from datafusion import RuntimeEnvBuilder, SessionConfig, SessionContext # create a session context with default settings ctx = SessionContext() print(ctx) # create a session context with explicit runtime and config settings - runtime = RuntimeConfig().with_disk_manager_os().with_fair_spill_pool(10000000) + runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000) config = ( SessionConfig() .with_create_default_catalog_and_schema(True) @@ -48,4 +48,4 @@ a :py:class:`~datafusion.context.SessionConfig` and :py:class:`~datafusion.conte You can read more about available :py:class:`~datafusion.context.SessionConfig` options in the `rust DataFusion Configuration guide `_, -and about :code:`RuntimeConfig` options in the rust `online API documentation `_. +and about :code:`RuntimeEnvBuilder` options in the rust `online API documentation `_. 
diff --git a/examples/create-context.py b/examples/create-context.py index 11525d8b8..760c8513e 100644 --- a/examples/create-context.py +++ b/examples/create-context.py @@ -15,14 +15,14 @@ # specific language governing permissions and limitations # under the License. -from datafusion import RuntimeConfig, SessionConfig, SessionContext +from datafusion import RuntimeEnvBuilder, SessionConfig, SessionContext # create a session context with default settings ctx = SessionContext() print(ctx) # create a session context with explicit runtime and config settings -runtime = RuntimeConfig().with_disk_manager_os().with_fair_spill_pool(10000000) +runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000) config = ( SessionConfig() .with_create_default_catalog_and_schema(True) diff --git a/examples/ffi-table-provider/Cargo.toml b/examples/ffi-table-provider/Cargo.toml index 4e54eaf03..4e6f91f33 100644 --- a/examples/ffi-table-provider/Cargo.toml +++ b/examples/ffi-table-provider/Cargo.toml @@ -21,8 +21,8 @@ version = "0.1.0" edition = "2021" [dependencies] -datafusion = { version = "43.0.0" } -datafusion-ffi = { version = "43.0.0" } +datafusion = { version = "44.0.0" } +datafusion-ffi = { version = "44.0.0" } pyo3 = { version = "0.22.6", features = ["extension-module", "abi3", "abi3-py38"] } arrow = { version = "53.2.0" } arrow-array = { version = "53.2.0" } diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 7367b0d3b..2d8db42c8 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -29,7 +29,7 @@ from .context import ( SessionContext, SessionConfig, - RuntimeConfig, + RuntimeEnvBuilder, SQLOptions, ) @@ -66,7 +66,7 @@ "SessionContext", "SessionConfig", "SQLOptions", - "RuntimeConfig", + "RuntimeEnvBuilder", "Expr", "ScalarUDF", "WindowFrame", diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 6d7f574c4..3c284c9f9 100644 --- a/python/datafusion/context.py +++ 
b/python/datafusion/context.py @@ -20,7 +20,7 @@ from __future__ import annotations from ._internal import SessionConfig as SessionConfigInternal -from ._internal import RuntimeConfig as RuntimeConfigInternal +from ._internal import RuntimeEnvBuilder as RuntimeEnvBuilderInternal from ._internal import SQLOptions as SQLOptionsInternal from ._internal import SessionContext as SessionContextInternal @@ -265,39 +265,41 @@ def set(self, key: str, value: str) -> SessionConfig: return self -class RuntimeConfig: +class RuntimeEnvBuilder: """Runtime configuration options.""" def __init__(self) -> None: - """Create a new :py:class:`RuntimeConfig` with default values.""" - self.config_internal = RuntimeConfigInternal() + """Create a new :py:class:`RuntimeEnvBuilder` with default values.""" + self.config_internal = RuntimeEnvBuilderInternal() - def with_disk_manager_disabled(self) -> RuntimeConfig: + def with_disk_manager_disabled(self) -> RuntimeEnvBuilder: """Disable the disk manager, attempts to create temporary files will error. Returns: - A new :py:class:`RuntimeConfig` object with the updated setting. + A new :py:class:`RuntimeEnvBuilder` object with the updated setting. """ self.config_internal = self.config_internal.with_disk_manager_disabled() return self - def with_disk_manager_os(self) -> RuntimeConfig: + def with_disk_manager_os(self) -> RuntimeEnvBuilder: """Use the operating system's temporary directory for disk manager. Returns: - A new :py:class:`RuntimeConfig` object with the updated setting. + A new :py:class:`RuntimeEnvBuilder` object with the updated setting. """ self.config_internal = self.config_internal.with_disk_manager_os() return self - def with_disk_manager_specified(self, *paths: str | pathlib.Path) -> RuntimeConfig: + def with_disk_manager_specified( + self, *paths: str | pathlib.Path + ) -> RuntimeEnvBuilder: """Use the specified paths for the disk manager's temporary files. Args: paths: Paths to use for the disk manager's temporary files. 
Returns: - A new :py:class:`RuntimeConfig` object with the updated setting. + A new :py:class:`RuntimeEnvBuilder` object with the updated setting. """ paths_list = [str(p) for p in paths] self.config_internal = self.config_internal.with_disk_manager_specified( @@ -305,16 +307,16 @@ def with_disk_manager_specified(self, *paths: str | pathlib.Path) -> RuntimeConf ) return self - def with_unbounded_memory_pool(self) -> RuntimeConfig: + def with_unbounded_memory_pool(self) -> RuntimeEnvBuilder: """Use an unbounded memory pool. Returns: - A new :py:class:`RuntimeConfig` object with the updated setting. + A new :py:class:`RuntimeEnvBuilder` object with the updated setting. """ self.config_internal = self.config_internal.with_unbounded_memory_pool() return self - def with_fair_spill_pool(self, size: int) -> RuntimeConfig: + def with_fair_spill_pool(self, size: int) -> RuntimeEnvBuilder: """Use a fair spill pool with the specified size. This pool works best when you know beforehand the query has multiple spillable @@ -335,16 +337,16 @@ def with_fair_spill_pool(self, size: int) -> RuntimeConfig: size: Size of the memory pool in bytes. Returns: - A new :py:class:`RuntimeConfig` object with the updated setting. + A new :py:class:`RuntimeEnvBuilder` object with the updated setting. Examples usage:: - config = RuntimeConfig().with_fair_spill_pool(1024) + config = RuntimeEnvBuilder().with_fair_spill_pool(1024) """ self.config_internal = self.config_internal.with_fair_spill_pool(size) return self - def with_greedy_memory_pool(self, size: int) -> RuntimeConfig: + def with_greedy_memory_pool(self, size: int) -> RuntimeEnvBuilder: """Use a greedy memory pool with the specified size. This pool works well for queries that do not need to spill or have a single @@ -355,32 +357,39 @@ def with_greedy_memory_pool(self, size: int) -> RuntimeConfig: size: Size of the memory pool in bytes. Returns: - A new :py:class:`RuntimeConfig` object with the updated setting. 
+ A new :py:class:`RuntimeEnvBuilder` object with the updated setting. Example usage:: - config = RuntimeConfig().with_greedy_memory_pool(1024) + config = RuntimeEnvBuilder().with_greedy_memory_pool(1024) """ self.config_internal = self.config_internal.with_greedy_memory_pool(size) return self - def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeConfig: + def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeEnvBuilder: """Use the specified path to create any needed temporary files. Args: path: Path to use for temporary files. Returns: - A new :py:class:`RuntimeConfig` object with the updated setting. + A new :py:class:`RuntimeEnvBuilder` object with the updated setting. Example usage:: - config = RuntimeConfig().with_temp_file_path("/tmp") + config = RuntimeEnvBuilder().with_temp_file_path("/tmp") """ self.config_internal = self.config_internal.with_temp_file_path(str(path)) return self +@deprecated("Use `RuntimeEnvBuilder` instead.") +class RuntimeConfig(RuntimeEnvBuilder): + """See `RuntimeEnvBuilder`.""" + + pass + + class SQLOptions: """Options to be used when performing SQL queries.""" @@ -454,7 +463,9 @@ class SessionContext: """ def __init__( - self, config: SessionConfig | None = None, runtime: RuntimeConfig | None = None + self, + config: SessionConfig | None = None, + runtime: RuntimeEnvBuilder | None = None, ) -> None: """Main interface for executing queries with DataFusion. 
diff --git a/python/tests/test_context.py b/python/tests/test_context.py index ab86faa9d..10e8ad0e9 100644 --- a/python/tests/test_context.py +++ b/python/tests/test_context.py @@ -25,7 +25,7 @@ from datafusion import ( DataFrame, - RuntimeConfig, + RuntimeEnvBuilder, SessionConfig, SessionContext, SQLOptions, @@ -43,7 +43,7 @@ def test_create_context_session_config_only(): def test_create_context_runtime_config_only(): - SessionContext(runtime=RuntimeConfig()) + SessionContext(runtime=RuntimeEnvBuilder()) @pytest.mark.parametrize("path_to_str", (True, False)) @@ -54,7 +54,7 @@ def test_runtime_configs(tmp_path, path_to_str): path1 = str(path1) if path_to_str else path1 path2 = str(path2) if path_to_str else path2 - runtime = RuntimeConfig().with_disk_manager_specified(path1, path2) + runtime = RuntimeEnvBuilder().with_disk_manager_specified(path1, path2) config = SessionConfig().with_default_catalog_and_schema("foo", "bar") ctx = SessionContext(config, runtime) assert ctx is not None @@ -67,7 +67,7 @@ def test_runtime_configs(tmp_path, path_to_str): def test_temporary_files(tmp_path, path_to_str): path = str(tmp_path) if path_to_str else tmp_path - runtime = RuntimeConfig().with_temp_file_path(path) + runtime = RuntimeEnvBuilder().with_temp_file_path(path) config = SessionConfig().with_default_catalog_and_schema("foo", "bar") ctx = SessionContext(config, runtime) assert ctx is not None @@ -77,7 +77,7 @@ def test_temporary_files(tmp_path, path_to_str): def test_create_context_with_all_valid_args(): - runtime = RuntimeConfig().with_disk_manager_os().with_fair_spill_pool(10000000) + runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000) config = ( SessionConfig() .with_create_default_catalog_and_schema(True) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 5dce188ed..01c6c9cef 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -103,8 +103,11 @@ def test_lit_arith(df): 
result = df.collect() assert len(result) == 1 result = result[0] + assert result.column(0) == pa.array([5, 6, 7]) - assert result.column(1) == pa.array(["Hello!", "World!", "!!"]) + assert result.column(1) == pa.array( + ["Hello!", "World!", "!!"], type=pa.string_view() + ) def test_math_functions(): @@ -661,9 +664,12 @@ def test_array_function_obj_tests(stmt, py_expr): ), ( f.concat(column("a").cast(pa.string()), literal("?")), - pa.array(["Hello?", "World?", "!?"]), + pa.array(["Hello?", "World?", "!?"], type=pa.string_view()), + ), + ( + f.initcap(column("c")), + pa.array(["Hello ", " World ", " !"], type=pa.string_view()), ), - (f.initcap(column("c")), pa.array(["Hello ", " World ", " !"])), (f.left(column("a"), literal(3)), pa.array(["Hel", "Wor", "!"])), (f.length(column("c")), pa.array([6, 7, 2], type=pa.int32())), (f.lower(column("a")), pa.array(["hello", "world", "!"])), @@ -871,8 +877,8 @@ def test_temporal_functions(df): result = df.collect() assert len(result) == 1 result = result[0] - assert result.column(0) == pa.array([12, 6, 7], type=pa.float64()) - assert result.column(1) == pa.array([2022, 2027, 2020], type=pa.float64()) + assert result.column(0) == pa.array([12, 6, 7], type=pa.int32()) + assert result.column(1) == pa.array([2022, 2027, 2020], type=pa.int32()) assert result.column(2) == pa.array( [datetime(2022, 12, 1), datetime(2027, 6, 1), datetime(2020, 7, 1)], type=pa.timestamp("us"), @@ -904,7 +910,7 @@ def test_temporal_functions(df): assert result.column(9) == pa.array( [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us") ) - assert result.column(10) == pa.array([31, 26, 2], type=pa.float64()) + assert result.column(10) == pa.array([31, 26, 2], type=pa.int32()) def test_arrow_cast(df): diff --git a/src/context.rs b/src/context.rs index 88c90e0fd..bab7fd42a 100644 --- a/src/context.rs +++ b/src/context.rs @@ -62,7 +62,7 @@ use datafusion::execution::context::{ use datafusion::execution::disk_manager::DiskManagerConfig; use 
datafusion::execution::memory_pool::{FairSpillPool, GreedyMemoryPool, UnboundedMemoryPool}; use datafusion::execution::options::ReadOptions; -use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion::prelude::{ AvroReadOptions, CsvReadOptions, DataFrame, NdJsonReadOptions, ParquetReadOptions, @@ -165,62 +165,62 @@ impl PySessionConfig { } /// Runtime options for a SessionContext -#[pyclass(name = "RuntimeConfig", module = "datafusion", subclass)] +#[pyclass(name = "RuntimeEnvBuilder", module = "datafusion", subclass)] #[derive(Clone)] -pub struct PyRuntimeConfig { - pub config: RuntimeConfig, +pub struct PyRuntimeEnvBuilder { + pub builder: RuntimeEnvBuilder, } #[pymethods] -impl PyRuntimeConfig { +impl PyRuntimeEnvBuilder { #[new] fn new() -> Self { Self { - config: RuntimeConfig::default(), + builder: RuntimeEnvBuilder::default(), } } fn with_disk_manager_disabled(&self) -> Self { - let config = self.config.clone(); - let config = config.with_disk_manager(DiskManagerConfig::Disabled); - Self { config } + let mut builder = self.builder.clone(); + builder = builder.with_disk_manager(DiskManagerConfig::Disabled); + Self { builder } } fn with_disk_manager_os(&self) -> Self { - let config = self.config.clone(); - let config = config.with_disk_manager(DiskManagerConfig::NewOs); - Self { config } + let builder = self.builder.clone(); + let builder = builder.with_disk_manager(DiskManagerConfig::NewOs); + Self { builder } } fn with_disk_manager_specified(&self, paths: Vec) -> Self { - let config = self.config.clone(); + let builder = self.builder.clone(); let paths = paths.iter().map(|s| s.into()).collect(); - let config = config.with_disk_manager(DiskManagerConfig::NewSpecified(paths)); - Self { config } + let builder = builder.with_disk_manager(DiskManagerConfig::NewSpecified(paths)); + Self { builder } } fn 
with_unbounded_memory_pool(&self) -> Self { - let config = self.config.clone(); - let config = config.with_memory_pool(Arc::new(UnboundedMemoryPool::default())); - Self { config } + let builder = self.builder.clone(); + let builder = builder.with_memory_pool(Arc::new(UnboundedMemoryPool::default())); + Self { builder } } fn with_fair_spill_pool(&self, size: usize) -> Self { - let config = self.config.clone(); - let config = config.with_memory_pool(Arc::new(FairSpillPool::new(size))); - Self { config } + let builder = self.builder.clone(); + let builder = builder.with_memory_pool(Arc::new(FairSpillPool::new(size))); + Self { builder } } fn with_greedy_memory_pool(&self, size: usize) -> Self { - let config = self.config.clone(); - let config = config.with_memory_pool(Arc::new(GreedyMemoryPool::new(size))); - Self { config } + let builder = self.builder.clone(); + let builder = builder.with_memory_pool(Arc::new(GreedyMemoryPool::new(size))); + Self { builder } } fn with_temp_file_path(&self, path: &str) -> Self { - let config = self.config.clone(); - let config = config.with_temp_file_path(path); - Self { config } + let builder = self.builder.clone(); + let builder = builder.with_temp_file_path(path); + Self { builder } } } @@ -276,19 +276,19 @@ impl PySessionContext { #[new] pub fn new( config: Option, - runtime: Option, + runtime: Option, ) -> PyResult { let config = if let Some(c) = config { c.config } else { SessionConfig::default().with_information_schema(true) }; - let runtime_config = if let Some(c) = runtime { - c.config + let runtime_env_builder = if let Some(c) = runtime { + c.builder } else { - RuntimeConfig::default() + RuntimeEnvBuilder::default() }; - let runtime = Arc::new(RuntimeEnv::try_new(runtime_config)?); + let runtime = Arc::new(runtime_env_builder.build()?); let session_state = SessionStateBuilder::new() .with_config(config) .with_runtime_env(runtime) diff --git a/src/dataset_exec.rs b/src/dataset_exec.rs index 2759aa678..9d2559429 100644 --- 
a/src/dataset_exec.rs +++ b/src/dataset_exec.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; /// Implements a Datafusion physical ExecutionPlan that delegates to a PyArrow Dataset /// This actually performs the projection, filtering and scanning of a Dataset use pyo3::prelude::*; @@ -34,11 +35,11 @@ use datafusion::error::{DataFusionError as InnerDataFusionError, Result as DFRes use datafusion::execution::context::TaskContext; use datafusion::logical_expr::utils::conjunction; use datafusion::logical_expr::Expr; -use datafusion::physical_expr::{EquivalenceProperties, PhysicalSortExpr}; +use datafusion::physical_expr::{EquivalenceProperties, LexOrdering}; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, ExecutionPlanProperties, - Partitioning, SendableRecordBatchStream, Statistics, + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, Partitioning, + SendableRecordBatchStream, Statistics, }; use crate::errors::DataFusionError; @@ -136,7 +137,8 @@ impl DatasetExec { let plan_properties = datafusion::physical_plan::PlanProperties::new( EquivalenceProperties::new(schema.clone()), Partitioning::UnknownPartitioning(fragments.len()), - ExecutionMode::Bounded, + EmissionType::Final, + Boundedness::Bounded, ); Ok(DatasetExec { @@ -251,12 +253,16 @@ impl ExecutionPlanProperties for DatasetExec { self.plan_properties.output_partitioning() } - fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + fn output_ordering(&self) -> Option<&LexOrdering> { None } - fn execution_mode(&self) -> datafusion::physical_plan::ExecutionMode { - self.plan_properties.execution_mode + fn boundedness(&self) -> Boundedness { + self.plan_properties.boundedness + } + + fn pipeline_behavior(&self) -> EmissionType { + 
self.plan_properties.emission_type } fn equivalence_properties(&self) -> &datafusion::physical_expr::EquivalenceProperties { diff --git a/src/functions.rs b/src/functions.rs index ccc1981bd..ae032d702 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -36,10 +36,7 @@ use datafusion::functions_aggregate; use datafusion::functions_window; use datafusion::logical_expr::expr::Alias; use datafusion::logical_expr::sqlparser::ast::NullTreatment as DFNullTreatment; -use datafusion::logical_expr::{ - expr::{find_df_window_func, WindowFunction}, - lit, Expr, WindowFunctionDefinition, -}; +use datafusion::logical_expr::{expr::WindowFunction, lit, Expr, WindowFunctionDefinition}; fn add_builder_fns_to_aggregate( agg_fn: Expr, @@ -232,12 +229,6 @@ fn when(when: PyExpr, then: PyExpr) -> PyResult { /// /// NOTE: we search the built-ins first because the `UDAF` versions currently do not have the same behavior. fn find_window_fn(name: &str, ctx: Option) -> PyResult { - // search built in window functions (soon to be deprecated) - let df_window_func = find_df_window_func(name); - if let Some(df_window_func) = df_window_func { - return Ok(df_window_func); - } - if let Some(ctx) = ctx { // search UDAFs let udaf = ctx diff --git a/src/lib.rs b/src/lib.rs index 0b57e0999..1111d5d06 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -78,7 +78,7 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/src/sql/logical.rs b/src/sql/logical.rs index 40f0a6a65..a541889c7 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -85,12 +85,10 @@ impl PyLogicalPlan { | LogicalPlan::Union(_) | LogicalPlan::Statement(_) | LogicalPlan::Values(_) - | LogicalPlan::Prepare(_) | LogicalPlan::Dml(_) | LogicalPlan::Ddl(_) | LogicalPlan::Copy(_) | LogicalPlan::DescribeTable(_) - | LogicalPlan::Execute(_) | 
LogicalPlan::RecursiveQuery(_) => Err(py_unsupported_variant_err(format!( "Conversion of variant not implemented: {:?}", self.plan diff --git a/src/substrait.rs b/src/substrait.rs index f89b6b093..16e8c9507 100644 --- a/src/substrait.rs +++ b/src/substrait.rs @@ -114,7 +114,8 @@ impl PySubstraitProducer { /// Convert DataFusion LogicalPlan to Substrait Plan #[staticmethod] pub fn to_substrait_plan(plan: PyLogicalPlan, ctx: &PySessionContext) -> PyResult { - match producer::to_substrait_plan(&plan.plan, &ctx.ctx) { + let session_state = ctx.ctx.state(); + match producer::to_substrait_plan(&plan.plan, &session_state) { Ok(plan) => Ok(PyPlan { plan: *plan }), Err(e) => Err(py_datafusion_err(e)), } @@ -134,7 +135,8 @@ impl PySubstraitConsumer { plan: PyPlan, py: Python, ) -> PyResult { - let result = consumer::from_substrait_plan(&ctx.ctx, &plan.plan); + let session_state = ctx.ctx.state(); + let result = consumer::from_substrait_plan(&session_state, &plan.plan); let logical_plan = wait_for_future(py, result).map_err(DataFusionError::from)?; Ok(PyLogicalPlan::new(logical_plan)) } diff --git a/src/udwf.rs b/src/udwf.rs index 3f5ad0b1d..689eb79e3 100644 --- a/src/udwf.rs +++ b/src/udwf.rs @@ -22,9 +22,7 @@ use std::sync::Arc; use arrow::array::{make_array, Array, ArrayData, ArrayRef}; use datafusion::logical_expr::function::{PartitionEvaluatorArgs, WindowUDFFieldArgs}; use datafusion::logical_expr::window_state::WindowAggState; -use datafusion::physical_plan::PhysicalExpr; use datafusion::scalar::ScalarValue; -use datafusion_functions_window_common::expr::ExpressionArgs; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; @@ -319,8 +317,4 @@ impl WindowUDFImpl for MultiColumnWindowUDF { let _ = _partition_evaluator_args; (self.partition_evaluator_factory)() } - - fn expressions(&self, expr_args: ExpressionArgs) -> Vec> { - expr_args.input_exprs().into() - } } From 2d8b1d32f4941b2e02a29e9135025a32ba6ae471 Mon Sep 17 00:00:00 2001 From: kosiew Date: Sat, 11 Jan 2025 
10:12:04 +0800 Subject: [PATCH 088/248] Default to ZSTD compression when writing Parquet (#981) * fix: update default compression to ZSTD and improve documentation for write_parquet method * fix: clarify compression level documentation for ZSTD in write_parquet method * fix: update default compression level for ZSTD to 4 in write_parquet method * fix: improve docstring formatting for DataFrame parquet writing method * feat: implement Compression enum and update write_parquet method to use it * add test * fix: remove unused import and update default compression to ZSTD in rs' write_parquet method * fix: update compression type strings to lowercase in DataFrame parquet writing method doc * test: update parquet compression tests to validate invalid and default compression levels * add comment on source of Compression * docs: enhance Compression enum documentation and add default level method * test: include gzip in default compression level tests for write_parquet * refactor: simplify Compression enum methods and improve type handling in DataFrame.write_parquet * docs: update Compression enum methods to include return type descriptions * move comment to within test * Ruff format --------- Co-authored-by: Tim Saucer --- python/datafusion/dataframe.py | 94 +++++++++++++++++++++++++++++++--- python/tests/test_dataframe.py | 14 ++++- src/dataframe.rs | 2 +- 3 files changed, 101 insertions(+), 9 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 0b38db924..f8aef0c91 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -21,7 +21,16 @@ from __future__ import annotations import warnings -from typing import Any, Iterable, List, TYPE_CHECKING, Literal, overload +from typing import ( + Any, + Iterable, + List, + TYPE_CHECKING, + Literal, + overload, + Optional, + Union, +) from datafusion.record_batch import RecordBatchStream from typing_extensions import deprecated from datafusion.plan import 
LogicalPlan, ExecutionPlan @@ -35,6 +44,60 @@ from datafusion._internal import DataFrame as DataFrameInternal from datafusion.expr import Expr, SortExpr, sort_or_default +from enum import Enum + + +# excerpt from deltalake +# https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163 +class Compression(Enum): + """Enum representing the available compression types for Parquet files.""" + + UNCOMPRESSED = "uncompressed" + SNAPPY = "snappy" + GZIP = "gzip" + BROTLI = "brotli" + LZ4 = "lz4" + LZ0 = "lz0" + ZSTD = "zstd" + LZ4_RAW = "lz4_raw" + + @classmethod + def from_str(cls, value: str) -> "Compression": + """Convert a string to a Compression enum value. + + Args: + value: The string representation of the compression type. + + Returns: + The Compression enum lowercase value. + + Raises: + ValueError: If the string does not match any Compression enum value. + """ + try: + return cls(value.lower()) + except ValueError: + raise ValueError( + f"{value} is not a valid Compression. Valid values are: {[item.value for item in Compression]}" + ) + + def get_default_level(self) -> Optional[int]: + """Get the default compression level for the compression type. + + Returns: + The default compression level for the compression type. 
+ """ + # GZIP, BROTLI default values from deltalake repo + # https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163 + # ZSTD default value from delta-rs + # https://github.com/apache/datafusion-python/pull/981#discussion_r1904789223 + if self == Compression.GZIP: + return 6 + elif self == Compression.BROTLI: + return 1 + elif self == Compression.ZSTD: + return 4 + return None class DataFrame: @@ -620,17 +683,36 @@ def write_csv(self, path: str | pathlib.Path, with_header: bool = False) -> None def write_parquet( self, path: str | pathlib.Path, - compression: str = "uncompressed", + compression: Union[str, Compression] = Compression.ZSTD, compression_level: int | None = None, ) -> None: """Execute the :py:class:`DataFrame` and write the results to a Parquet file. Args: path: Path of the Parquet file to write. - compression: Compression type to use. - compression_level: Compression level to use. - """ - self.df.write_parquet(str(path), compression, compression_level) + compression: Compression type to use. Default is "ZSTD". + Available compression types are: + - "uncompressed": No compression. + - "snappy": Snappy compression. + - "gzip": Gzip compression. + - "brotli": Brotli compression. + - "lz0": LZ0 compression. + - "lz4": LZ4 compression. + - "lz4_raw": LZ4_RAW compression. + - "zstd": Zstandard compression. + compression_level: Compression level to use. For ZSTD, the + recommended range is 1 to 22, with the default being 4. Higher levels + provide better compression but slower speed. 
+ """ + # Convert string to Compression enum if necessary + if isinstance(compression, str): + compression = Compression.from_str(compression) + + if compression in {Compression.GZIP, Compression.BROTLI, Compression.ZSTD}: + if compression_level is None: + compression_level = compression.get_default_level() + + self.df.write_parquet(str(path), compression.value, compression_level) def write_json(self, path: str | pathlib.Path) -> None: """Execute the :py:class:`DataFrame` and write the results to a JSON file. diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index e3bd1b2a5..fa5f4e8c5 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1107,14 +1107,24 @@ def test_write_compressed_parquet_wrong_compression_level( ) -@pytest.mark.parametrize("compression", ["brotli", "zstd", "wrong"]) -def test_write_compressed_parquet_missing_compression_level(df, tmp_path, compression): +@pytest.mark.parametrize("compression", ["wrong"]) +def test_write_compressed_parquet_invalid_compression(df, tmp_path, compression): path = tmp_path with pytest.raises(ValueError): df.write_parquet(str(path), compression=compression) +@pytest.mark.parametrize("compression", ["zstd", "brotli", "gzip"]) +def test_write_compressed_parquet_default_compression_level(df, tmp_path, compression): + # Test write_parquet with zstd, brotli, gzip default compression level, + # ie don't specify compression level + # should complete without error + path = tmp_path + + df.write_parquet(str(path), compression=compression) + + def test_dataframe_export(df) -> None: # Guarantees that we have the canonical implementation # reading our dataframe export diff --git a/src/dataframe.rs b/src/dataframe.rs index fcb46a756..71a6fe60f 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -463,7 +463,7 @@ impl PyDataFrame { /// Write a `DataFrame` to a Parquet file. 
#[pyo3(signature = ( path, - compression="uncompressed", + compression="zstd", compression_level=None ))] fn write_parquet( From 39fec53ca1182049700806a423f60d44f2a9676d Mon Sep 17 00:00:00 2001 From: kosiew Date: Tue, 14 Jan 2025 20:01:10 +0800 Subject: [PATCH 089/248] fix: correct LZ0 to LZO in compression options (#995) * fix: correct LZ0 to LZO in compression options * fix: disable LZO compression option and update tests to reflect its unavailability * fix: ruff format expected string in test_execution_plan * fix: update test for execution plan and add validation for invalid LZO compression * fix: remove LZO compression option and related test cases * ruff autoformat * fix: remove TODO comment regarding LZO compression implementation --- python/datafusion/dataframe.py | 6 ++++-- python/tests/test_dataframe.py | 2 ++ python/tests/test_functions.py | 18 +++++++++--------- src/dataframe.rs | 2 +- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index f8aef0c91..b0c1abdad 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -57,7 +57,9 @@ class Compression(Enum): GZIP = "gzip" BROTLI = "brotli" LZ4 = "lz4" - LZ0 = "lz0" + # lzo is not implemented yet + # https://github.com/apache/arrow-rs/issues/6970 + # LZO = "lzo" ZSTD = "zstd" LZ4_RAW = "lz4_raw" @@ -696,10 +698,10 @@ def write_parquet( - "snappy": Snappy compression. - "gzip": Gzip compression. - "brotli": Brotli compression. - - "lz0": LZ0 compression. - "lz4": LZ4 compression. - "lz4_raw": LZ4_RAW compression. - "zstd": Zstandard compression. + Note: LZO is not yet implemented in arrow-rs and is therefore excluded. compression_level: Compression level to use. For ZSTD, the recommended range is 1 to 22, with the default being 4. Higher levels provide better compression but slower speed. 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index fa5f4e8c5..a1a871e9a 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1115,6 +1115,8 @@ def test_write_compressed_parquet_invalid_compression(df, tmp_path, compression) df.write_parquet(str(path), compression=compression) +# not testing lzo because it it not implemented yet +# https://github.com/apache/arrow-rs/issues/6970 @pytest.mark.parametrize("compression", ["zstd", "brotli", "gzip"]) def test_write_compressed_parquet_default_compression_level(df, tmp_path, compression): # Test write_parquet with zstd, brotli, gzip default compression level, diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 01c6c9cef..add170c17 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -790,9 +790,9 @@ def test_hash_functions(df): ) assert result.column(2) == pa.array( [ - b("185F8DB32271FE25F561A6FC938B2E26" "4306EC304EDA518007D1764826381969"), - b("78AE647DC5544D227130A0682A51E30B" "C7777FBB6D8A8F17007463A3ECD1D524"), - b("BB7208BC9B5D7C04F1236A82A0093A5E" "33F40423D5BA8D4266F7092C3BA43B62"), + b("185F8DB32271FE25F561A6FC938B2E264306EC304EDA518007D1764826381969"), + b("78AE647DC5544D227130A0682A51E30BC7777FBB6D8A8F17007463A3ECD1D524"), + b("BB7208BC9B5D7C04F1236A82A0093A5E33F40423D5BA8D4266F7092C3BA43B62"), ] ) assert result.column(3) == pa.array( @@ -838,16 +838,16 @@ def test_hash_functions(df): ) assert result.column(5) == pa.array( [ - b("F73A5FBF881F89B814871F46E26AD3FA" "37CB2921C5E8561618639015B3CCBB71"), - b("B792A0383FB9E7A189EC150686579532" "854E44B71AC394831DAED169BA85CCC5"), - b("27988A0E51812297C77A433F63523334" "6AEE29A829DCF4F46E0F58F402C6CFCB"), + b("F73A5FBF881F89B814871F46E26AD3FA37CB2921C5E8561618639015B3CCBB71"), + b("B792A0383FB9E7A189EC150686579532854E44B71AC394831DAED169BA85CCC5"), + b("27988A0E51812297C77A433F635233346AEE29A829DCF4F46E0F58F402C6CFCB"), ] ) assert 
result.column(6) == pa.array( [ - b("FBC2B0516EE8744D293B980779178A35" "08850FDCFE965985782C39601B65794F"), - b("BF73D18575A736E4037D45F9E316085B" "86C19BE6363DE6AA789E13DEAACC1C4E"), - b("C8D11B9F7237E4034ADBCD2005735F9B" "C4C597C75AD89F4492BEC8F77D15F7EB"), + b("FBC2B0516EE8744D293B980779178A3508850FDCFE965985782C39601B65794F"), + b("BF73D18575A736E4037D45F9E316085B86C19BE6363DE6AA789E13DEAACC1C4E"), + b("C8D11B9F7237E4034ADBCD2005735F9BC4C597C75AD89F4492BEC8F77D15F7EB"), ] ) assert result.column(7) == result.column(1) # SHA-224 diff --git a/src/dataframe.rs b/src/dataframe.rs index 71a6fe60f..b875480a7 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -491,7 +491,7 @@ impl PyDataFrame { ZstdLevel::try_new(verify_compression_level(compression_level)? as i32) .map_err(|e| PyValueError::new_err(format!("{e}")))?, ), - "lz0" => Compression::LZO, + "lzo" => Compression::LZO, "lz4" => Compression::LZ4, "lz4_raw" => Compression::LZ4_RAW, "uncompressed" => Compression::UNCOMPRESSED, From 31fee392d41f723179678ee7fdac2719ef1d40d1 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 19 Jan 2025 07:51:14 -0500 Subject: [PATCH 090/248] Feat/use uv python management (#994) * Remove requirements files and add dependencies into pyproject.toml instead * Remove old conda files since we will use uv as our primary method for developers to set up environments * Working through CI changes to use uv instead of pip and conda * Add uv lock to exclude files * Revert "Remove old conda files since we will use uv as our primary method for developers to set up environments" This reverts commit 88aff7e39334cde8101ef6d313c6bb2bd1f3981c. 
* Windows workflows don't use source command * Add in extra include for ignoring rat * Use uv commands in CI * Remove conda recipes and CI stages * Working on CI using uv * Install doc requirements * Remove caching uv * Set uv venv * Add requirements for building * Revert github action to allowed one * Call uv sync with verbose mode so users can see the build occurring in CI * Test setting specific hash on action * Test setting rust-toolchain github action with pinned version * Testing nightly rust toolchain against apache rejection criteria * Github action is fickle with the pattern matching * Switch all Ci to use nightly rust toolchain until infra team whitelists the stable toolchain * Speed up CI by preventing build during uv sync * Additional uv commands missing no-project option * Setting python versions of dependencies to match lowest supported python version, 3.8 * Update maturin and move to deps for dev * CI ordering was wrong and maturin needed uv option * Switch to stable toolchain * uv requires two dashes * Submodule init * change directories for unit tests * Add deps for build * Maturin build doesn't take uv as parameter * Update documentation for setting up with uv * Enable cache in CI * Update documentation to use uv * Small adjustment to CI config --- .github/workflows/build.yml | 62 +- .github/workflows/conda.yml | 107 - .github/workflows/docs.yaml | 23 +- .github/workflows/test.yaml | 59 +- .pre-commit-config.yaml | 4 +- README.md | 68 +- conda/environments/datafusion-cuda-dev.yaml | 44 - conda/environments/datafusion-dev.yaml | 41 - conda/recipes/bld.bat | 26 - conda/recipes/build.sh | 84 - conda/recipes/meta.yaml | 75 - dev/python_lint.sh | 2 +- dev/release/README.md | 27 +- dev/release/rat_exclude_files.txt | 4 +- dev/release/verify-release-candidate.sh | 6 +- docs/README.md | 32 +- docs/build.sh | 6 + docs/mdbook/src/installation.md | 53 +- docs/requirements.txt | 26 - .../source/contributor-guide/introduction.rst | 42 +- pyproject.toml | 25 +- 
requirements-310.txt | 195 -- requirements-311.txt | 175 -- requirements-312.txt | 184 -- requirements.in | 26 - uv.lock | 1842 +++++++++++++++++ 26 files changed, 2031 insertions(+), 1207 deletions(-) delete mode 100644 .github/workflows/conda.yml delete mode 100644 conda/environments/datafusion-cuda-dev.yaml delete mode 100644 conda/environments/datafusion-dev.yaml delete mode 100644 conda/recipes/bld.bat delete mode 100644 conda/recipes/build.sh delete mode 100644 conda/recipes/meta.yaml delete mode 100644 docs/requirements.txt delete mode 100644 requirements-310.txt delete mode 100644 requirements-311.txt delete mode 100644 requirements-312.txt delete mode 100644 requirements.in create mode 100644 uv.lock diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 084a96192..acabad3ca 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -31,28 +31,33 @@ jobs: - name: Install Python uses: actions/setup-python@v5 with: - python-version: "3.11" + python-version: "3.12" + + - uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + + # Use the --no-install-package to only install the dependencies + # but do not yet build the rust library - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install ruff + run: uv sync --dev --no-install-package datafusion + # Update output format to enable automatic inline annotations. 
- name: Run Ruff run: | - ruff check --output-format=github python/ - ruff format --check python/ + uv run --no-project ruff check --output-format=github python/ + uv run --no-project ruff format --check python/ generate-license: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: actions-rs/toolchain@v1 + - uses: astral-sh/setup-uv@v5 with: - profile: minimal - toolchain: stable - override: true + enable-cache: true + - name: Generate license file - run: python ./dev/create_license.py + run: uv run --no-project python ./dev/create_license.py - uses: actions/upload-artifact@v4 with: name: python-wheel-license @@ -74,15 +79,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - - - name: Upgrade pip - run: python -m pip install --upgrade pip - - - name: Install maturin - run: pip install maturin==1.5.1 + - uses: dtolnay/rust-toolchain@stable - run: rm LICENSE.txt - name: Download LICENSE.txt @@ -97,8 +94,14 @@ jobs: version: "27.4" repo-token: ${{ secrets.GITHUB_TOKEN }} + - uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + - name: Build Python package - run: maturin build --release --strip --features substrait + run: | + uv sync --dev --no-install-package datafusion + uv run --no-project maturin build --release --strip --features substrait - name: List Windows wheels if: matrix.os == 'windows-latest' @@ -132,15 +135,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - - - name: Upgrade pip - run: python -m pip install --upgrade pip - - - name: Install maturin - run: pip install maturin==1.5.1 + - uses: dtolnay/rust-toolchain@stable - run: rm LICENSE.txt - name: Download LICENSE.txt @@ -155,9 +150,14 @@ jobs: version: "27.4" repo-token: ${{ secrets.GITHUB_TOKEN }} + - uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + - name: Build Python package run: | - maturin build --release --strip 
--features substrait + uv sync --dev --no-install-package datafusion + uv run --no-project maturin build --release --strip --features substrait - name: List Mac wheels run: find target/wheels/ diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml deleted file mode 100644 index c2b8fab02..000000000 --- a/.github/workflows/conda.yml +++ /dev/null @@ -1,107 +0,0 @@ -name: Build conda nightly -on: - push: - branches: - - main - pull_request: - paths: - - Cargo.toml - - Cargo.lock - - pyproject.toml - - conda/recipes/** - - .github/workflows/conda.yml - schedule: - - cron: '0 0 * * 0' - -# When this workflow is queued, automatically cancel any previous running -# or pending jobs from the same branch -concurrency: - group: conda-${{ github.head_ref }} - cancel-in-progress: true - -# Required shell entrypoint to have properly activated conda environments -defaults: - run: - shell: bash -l {0} - -jobs: - conda: - name: "Build conda nightlies (python: ${{ matrix.python }}, arch: ${{ matrix.arch }})" - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python: ["3.8", "3.9", "3.10", "3.11"] - arch: ["linux-64", "linux-aarch64"] - steps: - - name: Manage disk space - if: matrix.arch == 'linux-aarch64' - run: | - sudo mkdir -p /opt/empty_dir || true - for d in \ - /opt/ghc \ - /opt/hostedtoolcache \ - /usr/lib/jvm \ - /usr/local/.ghcup \ - /usr/local/lib/android \ - /usr/local/share/powershell \ - /usr/share/dotnet \ - /usr/share/swift \ - ; do - sudo rsync --stats -a --delete /opt/empty_dir/ $d || true - done - sudo apt-get purge -y -f firefox \ - google-chrome-stable \ - microsoft-edge-stable - sudo apt-get autoremove -y >& /dev/null - sudo apt-get autoclean -y >& /dev/null - sudo docker image prune --all --force - df -h - - name: Create swapfile - if: matrix.arch == 'linux-aarch64' - run: | - sudo fallocate -l 10GiB /swapfile || true - sudo chmod 600 /swapfile || true - sudo mkswap /swapfile || true - sudo swapon /swapfile || true - - uses: 
actions/checkout@v4 - with: - fetch-depth: 0 - - name: Set up Python - uses: conda-incubator/setup-miniconda@v3.0.4 - with: - miniforge-variant: Miniforge3 - python-version: "3.8" - channel-priority: strict - - name: Install dependencies - run: | - conda install -c conda-forge conda-build conda-verify - - which python - pip list - conda list - # Clean the conda cache - - name: Clean Conda Cache - run: conda clean --all --yes - - name: Build conda packages - run: | - # suffix for nightly package versions - export VERSION_SUFFIX=a`date +%y%m%d` - - conda build conda/recipes \ - --python ${{ matrix.python }} \ - --variants "{target_platform: [${{ matrix.arch }}]}" \ - --error-overlinking \ - --no-test \ - --no-anaconda-upload \ - --output-folder packages - - name: Test conda packages - if: matrix.arch == 'linux-64' # can only test native platform packages - run: | - conda build --test packages/${{ matrix.arch }}/*.tar.bz2 - - name: Upload conda packages as artifacts - uses: actions/upload-artifact@v4 - with: - name: "conda nightlies (python - ${{ matrix.python }}, arch - ${{ matrix.arch }})" - # need to install all conda channel metadata to properly install locally - path: packages/ diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 86288e2d8..9037e0a5c 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -57,27 +57,24 @@ jobs: version: '27.4' repo-token: ${{ secrets.GITHUB_TOKEN }} - - name: Install dependencies - run: | - set -x - python3 -m venv venv - source venv/bin/activate - pip install -r requirements-311.txt - pip install -r docs/requirements.txt - - name: Build Datafusion + - name: Install dependencies and build + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + + - name: Build repo run: | - set -x - source venv/bin/activate - maturin develop + uv venv + uv sync --dev --no-install-package datafusion --group docs + uv run --no-project maturin develop --uv - name: Build docs run: | set -x - source 
venv/bin/activate cd docs curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet - make html + uv run --no-project make html - name: Copy & push the generated HTML if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag') diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 21faedecd..c93d4c06f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -43,11 +43,10 @@ jobs: - uses: actions/checkout@v4 - name: Setup Rust Toolchain - uses: actions-rs/toolchain@v1 + uses: dtolnay/rust-toolchain@stable id: rust-toolchain with: - toolchain: ${{ matrix.toolchain }} - override: true + components: clippy,rustfmt - name: Install Protoc uses: arduino/setup-protoc@v3 @@ -64,60 +63,35 @@ jobs: uses: actions/cache@v4 with: path: ~/.cargo - key: cargo-cache-${{ steps.rust-toolchain.outputs.rustc_hash }}-${{ hashFiles('Cargo.lock') }} + key: cargo-cache-${{ steps.rust-toolchain.outputs.cachekey }}-${{ hashFiles('Cargo.lock') }} - name: Check Formatting - uses: actions-rs/cargo@v1 if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }} - with: - command: fmt - args: -- --check + run: cargo fmt -- --check - name: Run Clippy - uses: actions-rs/cargo@v1 if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }} - with: - command: clippy - args: --all-targets --all-features -- -D clippy::all -A clippy::redundant_closure - - - name: Create Virtualenv (3.12) - if: ${{ matrix.python-version == '3.12' }} - run: | - python -m venv venv - source venv/bin/activate - pip install -r requirements-312.txt + run: cargo clippy --all-targets --all-features -- -D clippy::all -A clippy::redundant_closure - - name: Create Virtualenv (3.10) - if: ${{ matrix.python-version == '3.10' }} - run: | - python -m venv 
venv - source venv/bin/activate - pip install -r requirements-310.txt - - - name: Create Virtualenv (3.11) - if: ${{ matrix.python-version == '3.11' }} - run: | - python -m venv venv - source venv/bin/activate - pip install -r requirements-311.txt + - name: Install dependencies and build + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true - name: Run tests env: RUST_BACKTRACE: 1 run: | git submodule update --init - source venv/bin/activate - pip install -e . -vv - pytest -v . + uv sync --dev --no-install-package datafusion + uv run --no-project maturin develop --uv + uv run --no-project pytest -v . - name: FFI unit tests run: | - source venv/bin/activate - pip install -e . -vv - pip install maturin==1.5.1 cd examples/ffi-table-provider - maturin develop --release --strip - pytest python/tests/_test_table_provider.py + uv run --no-project maturin develop --uv + uv run --no-project pytest python/tests/_test_table_provider.py - name: Cache the generated dataset id: cache-tpch-dataset @@ -134,7 +108,6 @@ jobs: - name: Run TPC-H examples run: | - source venv/bin/activate cd examples/tpch - python convert_data_to_parquet.py - pytest _tests.py + uv run --no-project python convert_data_to_parquet.py + uv run --no-project pytest _tests.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8509fae2c..e20fedf5c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,9 +17,9 @@ repos: - repo: https://github.com/rhysd/actionlint - rev: v1.6.23 + rev: v1.7.6 hooks: - - id: actionlint-docker + - id: actionlint-docker - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. rev: v0.3.0 diff --git a/README.md b/README.md index ca612c1ab..5aaf7f5f3 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,13 @@ See [examples](examples/README.md) for more information. 
- [Serialize query plans using Substrait](https://github.com/apache/datafusion-python/blob/main/examples/substrait.py) -## How to install (from pip) +## How to install + +### uv + +```bash +uv add datafusion +``` ### Pip @@ -164,61 +170,69 @@ You can verify the installation by running: ## How to develop -This assumes that you have rust and cargo installed. We use the workflow recommended by [pyo3](https://github.com/PyO3/pyo3) and [maturin](https://github.com/PyO3/maturin). +This assumes that you have rust and cargo installed. We use the workflow recommended by [pyo3](https://github.com/PyO3/pyo3) and [maturin](https://github.com/PyO3/maturin). The Maturin tools used in this workflow can be installed either via `uv` or `pip`. Both approaches should offer the same experience. It is recommended to use `uv` since it has significant performance improvements +over `pip`. -The Maturin tools used in this workflow can be installed either via Conda or Pip. Both approaches should offer the same experience. Multiple approaches are only offered to appease developer preference. Bootstrapping for both Conda and Pip are as follows. +Bootstrap (`uv`): -Bootstrap (Conda): +By default `uv` will attempt to build the datafusion python package. For our development we prefer to build manually. 
This means +that when creating your virtual environment using `uv sync` you need to pass in the additional `--no-install-package datafusion` +and for `uv run` commands the additional parameter `--no-project` ```bash # fetch this repo git clone git@github.com:apache/datafusion-python.git -# create the conda environment for dev -conda env create -f ./conda/environments/datafusion-dev.yaml -n datafusion-dev -# activate the conda environment -conda activate datafusion-dev +# create the virtual environment +uv sync --dev --no-install-package datafusion +# activate the environment +source .venv/bin/activate ``` -Or alternatively, if you are on an OS that supports CUDA Toolkit, you can use `-f ./conda/environments/datafusion-cuda-dev.yaml`. - -Bootstrap (Pip): +Bootstrap (`pip`): ```bash # fetch this repo git clone git@github.com:apache/datafusion-python.git # prepare development environment (used to build wheel / install in development) -python3 -m venv venv +python3 -m venv .venv # activate the venv -source venv/bin/activate +source .venv/bin/activate # update pip itself if necessary python -m pip install -U pip -# install dependencies (for Python 3.8+) -python -m pip install -r requirements.in +# install dependencies +python -m pip install -r pyproject.toml ``` The tests rely on test data in git submodules. ```bash -git submodule init -git submodule update +git submodule update --init ``` Whenever rust code changes (your changes or via `git pull`): ```bash # make sure you activate the venv using "source venv/bin/activate" first -maturin develop +maturin develop --uv python -m pytest ``` +Alternatively if you are using `uv` you can do the following without +needing to activate the virtual environment: + +```bash +uv run --no-project maturin develop --uv +uv run --no-project pytest . 
+``` + ### Running & Installing pre-commit hooks -arrow-datafusion-python takes advantage of [pre-commit](https://pre-commit.com/) to assist developers with code linting to help reduce +`datafusion-python` takes advantage of [pre-commit](https://pre-commit.com/) to assist developers with code linting to help reduce the number of commits that ultimately fail in CI due to linter errors. Using the pre-commit hooks is optional for the developer but certainly helpful for keeping PRs clean and concise. Our pre-commit hooks can be installed by running `pre-commit install`, which will install the configurations in -your ARROW_DATAFUSION_PYTHON_ROOT/.github directory and run each time you perform a commit, failing to complete +your DATAFUSION_PYTHON_ROOT/.github directory and run each time you perform a commit, failing to complete the commit if an offending lint is found allowing you to make changes locally before pushing. The pre-commit hooks can also be run adhoc without installing them by simply running `pre-commit run --all-files` @@ -236,18 +250,8 @@ There are scripts in `ci/scripts` for running Rust and Python linters. 
## How to update dependencies -To change test dependencies, change the `requirements.in` and run +To change test dependencies, change the `pyproject.toml` and run ```bash -# install pip-tools (this can be done only once), also consider running in venv -python -m pip install pip-tools -python -m piptools compile --generate-hashes -o requirements-310.txt +uv sync --dev --no-install-package datafusion ``` - -To update dependencies, run with `-U` - -```bash -python -m piptools compile -U --generate-hashes -o requirements-310.txt -``` - -More details [here](https://github.com/jazzband/pip-tools) diff --git a/conda/environments/datafusion-cuda-dev.yaml b/conda/environments/datafusion-cuda-dev.yaml deleted file mode 100644 index 1f6f23942..000000000 --- a/conda/environments/datafusion-cuda-dev.yaml +++ /dev/null @@ -1,44 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -channels: - - conda-forge -dependencies: - - black - - flake8 - - isort - - maturin>=1.5.1 - - mypy - - numpy - - pyarrow>=11.0.0 - - pytest - - toml - - importlib_metadata - - python>=3.10 - # Packages useful for building distributions and releasing - - mamba - - conda-build - - anaconda-client - # Packages for documentation building - - sphinx - - pydata-sphinx-theme==0.8.0 - - myst-parser - - jinja2 - # GPU packages - - cudf - - cudatoolkit=11.8 -name: datafusion-dev diff --git a/conda/environments/datafusion-dev.yaml b/conda/environments/datafusion-dev.yaml deleted file mode 100644 index b4b503dc6..000000000 --- a/conda/environments/datafusion-dev.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -channels: - - conda-forge -dependencies: - - black - - flake8 - - isort - - maturin>=1.5.1 - - mypy - - numpy - - pyarrow>=11.0.0 - - pytest - - toml - - importlib_metadata - - python>=3.10 - # Packages useful for building distributions and releasing - - mamba - - conda-build - - anaconda-client - # Packages for documentation building - - sphinx - - pydata-sphinx-theme==0.8.0 - - myst-parser - - jinja2 -name: datafusion-dev diff --git a/conda/recipes/bld.bat b/conda/recipes/bld.bat deleted file mode 100644 index 90626a637..000000000 --- a/conda/recipes/bld.bat +++ /dev/null @@ -1,26 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -maturin build -vv -j %CPU_COUNT% --release --strip --features substrait --manylinux off --interpreter=%PYTHON% - -FOR /F "delims=" %%i IN ('dir /s /b target\wheels\*.whl') DO set datafusion_wheel=%%i - -%PYTHON% -m pip install --no-deps %datafusion_wheel% -vv - -cargo-bundle-licenses --format yaml --output THIRDPARTY.yml diff --git a/conda/recipes/build.sh b/conda/recipes/build.sh deleted file mode 100644 index 259894313..000000000 --- a/conda/recipes/build.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -set -ex - -# See https://github.com/conda-forge/rust-feedstock/blob/master/recipe/build.sh for cc env explanation -if [ "$c_compiler" = gcc ] ; then - case "$target_platform" in - linux-64) rust_env_arch=X86_64_UNKNOWN_LINUX_GNU ;; - linux-aarch64) rust_env_arch=AARCH64_UNKNOWN_LINUX_GNU ;; - linux-ppc64le) rust_env_arch=POWERPC64LE_UNKNOWN_LINUX_GNU ;; - *) echo "unknown target_platform $target_platform" ; exit 1 ;; - esac - - export CARGO_TARGET_${rust_env_arch}_LINKER=$CC -fi - -declare -a _xtra_maturin_args - -mkdir -p $SRC_DIR/.cargo - -if [ "$target_platform" = "osx-64" ] ; then - cat <> $SRC_DIR/.cargo/config -[target.x86_64-apple-darwin] -linker = "$CC" -rustflags = [ - "-C", "link-arg=-undefined", - "-C", "link-arg=dynamic_lookup", -] - -EOF - - _xtra_maturin_args+=(--target=x86_64-apple-darwin) - -elif [ "$target_platform" = "osx-arm64" ] ; then - cat <> $SRC_DIR/.cargo/config -# Required for intermediate codegen stuff -[target.x86_64-apple-darwin] -linker = "$CC_FOR_BUILD" - -# Required for final binary artifacts for target -[target.aarch64-apple-darwin] -linker = "$CC" -rustflags = [ - "-C", "link-arg=-undefined", - "-C", "link-arg=dynamic_lookup", -] - -EOF - _xtra_maturin_args+=(--target=aarch64-apple-darwin) - - # This variable must be set to the directory containing the target's libpython DSO - export PYO3_CROSS_LIB_DIR=$PREFIX/lib - - # xref: https://github.com/PyO3/pyo3/commit/7beb2720 - export PYO3_PYTHON_VERSION=${PY_VER} - - # xref: https://github.com/conda-forge/python-feedstock/issues/621 - sed -i.bak 's,aarch64,arm64,g' $BUILD_PREFIX/venv/lib/os-patch.py - sed -i.bak 's,aarch64,arm64,g' $BUILD_PREFIX/venv/lib/platform-patch.py -fi - -maturin build -vv -j "${CPU_COUNT}" --release --strip --features substrait --manylinux off --interpreter="${PYTHON}" "${_xtra_maturin_args[@]}" - -"${PYTHON}" -m pip install $SRC_DIR/target/wheels/datafusion*.whl --no-deps -vv - -cargo-bundle-licenses --format yaml --output THIRDPARTY.yml diff --git 
a/conda/recipes/meta.yaml b/conda/recipes/meta.yaml deleted file mode 100644 index b0784253a..000000000 --- a/conda/recipes/meta.yaml +++ /dev/null @@ -1,75 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -{% set name = "datafusion" %} -{% set major_minor_patch = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').split('.') %} -{% set new_patch = major_minor_patch[2] | int + 1 %} -{% set version = (major_minor_patch[:2] + [new_patch]) | join('.') + environ.get('VERSION_SUFFIX', '') %} - - -package: - name: {{ name|lower }} - version: {{ version }} - -source: - git_url: ../.. 
- -build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ python | replace(".", "") }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - -requirements: - build: - - python # [build_platform != target_platform] - - cross-python_{{ target_platform }} # [build_platform != target_platform] - - zlib # [build_platform != target_platform] - - {{ compiler('c') }} - - {{ compiler('rust') }} - - cargo-bundle-licenses - - maturin >=1.5.1,<1.6.0 - - libprotobuf =3 - host: - - python - - maturin >=1.5.1,<1.6.0 - - pip - - zlib - - xz # [linux64] - run: - - python - - pyarrow >=11.0.0 - - typing_extensions - -test: - imports: - - datafusion - commands: - - pip check - requires: - - pip - -about: - home: https://arrow.apache.org/datafusion - license: Apache-2.0 - license_family: APACHE - license_file: - - LICENSE.txt - - THIRDPARTY.yml - description: | - DataFusion is an extensible query execution framework, written in Rust, - that uses Apache Arrow as its in-memory format. - doc_url: https://arrow.apache.org/datafusion - dev_url: https://github.com/apache/arrow-datafusion diff --git a/dev/python_lint.sh b/dev/python_lint.sh index 29f0d4833..2d867f29d 100755 --- a/dev/python_lint.sh +++ b/dev/python_lint.sh @@ -21,6 +21,6 @@ # DataFusion CI does set -e -source venv/bin/activate +source .venv/bin/activate flake8 --exclude venv,benchmarks/db-benchmark --ignore=E501,W503 black --line-length 79 . 
diff --git a/dev/release/README.md b/dev/release/README.md index b2c015e1d..f0b333999 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -172,8 +172,8 @@ git checkout 40.0.0-rc1 git submodule update --init --recursive # create the env -python3 -m venv venv -source venv/bin/activate +python3 -m venv .venv +source .venv/bin/activate # install release candidate pip install --extra-index-url https://test.pypi.org/simple/ datafusion==40.0.0 @@ -218,28 +218,9 @@ uploading them using `twine`: twine upload --repository pypi dist-release/* ``` -### Publish Python Artifacts to Anaconda +### Publish Python Artifacts to conda-forge -Publishing artifacts to Anaconda is similar to PyPi. First, Download the source tarball created in the previous step and untar it. - -```bash -# Assuming you have an existing conda environment named `datafusion-dev` if not see root README for instructions -conda activate datafusion-dev -conda build . -``` - -This will setup a virtual conda environment and build the artifacts inside of that virtual env. This step can take a few minutes as the entire build, host, and runtime environments are setup. Once complete a local filesystem path will be emitted for the location of the resulting package. Observe that path and copy to your clipboard. - -Ex: `/home/conda/envs/datafusion/conda-bld/linux-64/datafusion-0.7.0.tar.bz2` - -Now you are ready to publish this resulting package to anaconda.org. This can be accomplished in a few simple steps. 
- -```bash -# First login to Anaconda with the datafusion credentials -anaconda login -# Upload the package -anaconda upload /home/conda/envs/datafusion/conda-bld/linux-64/datafusion-0.7.0.tar.bz2 -``` +Pypi packages auto upload to conda-forge via [datafusion feedstock](https://github.com/conda-forge/datafusion-feedstock) ### Push the Release Tag diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index f65ddd06e..dcd5d9aac 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -45,4 +45,6 @@ Cargo.lock .github/* benchmarks/tpch/queries/q*.sql benchmarks/tpch/create_tables.sql -.cargo/config.toml \ No newline at end of file +.cargo/config.toml +**/.cargo/config.toml +uv.lock \ No newline at end of file diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 3879a267f..1a9104b55 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -106,7 +106,7 @@ setup_tempdir() { } test_source_distribution() { - # install rust toolchain in a similar fashion like test-miniconda + # install rust toolchain export RUSTUP_HOME=$PWD/test-rustup export CARGO_HOME=$PWD/test-rustup @@ -125,8 +125,8 @@ test_source_distribution() { git clone https://github.com/apache/arrow-testing.git testing git clone https://github.com/apache/parquet-testing.git parquet-testing - python3 -m venv venv - source venv/bin/activate + python3 -m venv .venv + source .venv/bin/activate python3 -m pip install -U pip python3 -m pip install -r requirements-310.txt maturin develop diff --git a/docs/README.md b/docs/README.md index b4b94120e..2bffea9bd 100644 --- a/docs/README.md +++ b/docs/README.md @@ -26,42 +26,32 @@ when changes are merged to the main branch. ## Dependencies It's recommended to install build dependencies and build the documentation -inside a Python `venv`. +inside a Python `venv` using `uv`. 
To prepare building the documentation run the following on the root level of the project: -1. Set up virtual environment if it was not already created - ```bash - python3 -m venv venv - ``` -1. Activate virtual environment - ```bash - source venv/bin/activate - ``` -1. Install Datafusion's Python dependencies - ```bash - pip install -r requirements-310.txt - ``` -1. Install documentation dependencies - ```bash - pip install -r docs/requirements.txt - ``` +```bash +# Set up a virtual environment with the documentation dependencies +uv sync --dev --group docs --no-install-package datafusion +``` ## Build & Preview Run the provided script to build the HTML pages. ```bash -cd docs -./build.sh +# Build the repository +uv run --no-project maturin develop --uv +# Build the documentation +uv run --no-project docs/build.sh ``` -The HTML will be generated into a `build` directory. +The HTML will be generated into a `build` directory in `docs`. Preview the site on Linux by running this command. ```bash -firefox build/html/index.html +firefox docs/build/html/index.html ``` ## Release Process diff --git a/docs/build.sh b/docs/build.sh index 31398d195..f73330323 100755 --- a/docs/build.sh +++ b/docs/build.sh @@ -20,6 +20,10 @@ set -e +original_dir=$(pwd) +script_dir=$(dirname "$(realpath "$0")") +cd "$script_dir" || exit + if [ ! -f pokemon.csv ]; then curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv fi @@ -33,3 +37,5 @@ rm -rf temp 2> /dev/null mkdir temp cp -rf source/* temp/ make SOURCEDIR=`pwd`/temp html + +cd "$original_dir" || exit diff --git a/docs/mdbook/src/installation.md b/docs/mdbook/src/installation.md index ba00c8b80..b29f3b66b 100644 --- a/docs/mdbook/src/installation.md +++ b/docs/mdbook/src/installation.md @@ -18,44 +18,45 @@ DataFusion is easy to install, just like any other Python library. 
-## Using pip +## Using uv -``` bash -pip install datafusion -``` +If you do not yet have a virtual environment, create one: -## Conda & JupyterLab setup +```bash +uv venv +``` -This section explains how to install DataFusion in a conda environment with other libraries that allow for a nice Jupyter workflow. This setup is completely optional. These steps are only needed if you'd like to run DataFusion in a Jupyter notebook and have an interface like this: +You can add datafusion to your virtual environment with the usual: -![DataFusion in Jupyter](https://github.com/MrPowers/datafusion-book/raw/main/src/images/datafusion-jupyterlab.png) +```bash +uv pip install datafusion +``` -Create a conda environment with DataFusion, Jupyter, and other useful dependencies in the `datafusion-env.yml` file: +Or, to add to a project: +```bash +uv add datafusion ``` -name: datafusion-env -channels: - - conda-forge - - defaults -dependencies: - - python=3.9 - - ipykernel - - nb_conda - - jupyterlab - - jupyterlab_code_formatter - - isort - - black - - pip - - pip: - - datafusion +## Using pip + +``` bash +pip install datafusion ``` -Create the environment with `conda env create -f datafusion-env.yml`. +## uv & JupyterLab setup -Activate the environment with `conda activate datafusion-env`. +This section explains how to install DataFusion in a uv environment with other libraries that allow for a nice Jupyter workflow. This setup is completely optional. These steps are only needed if you'd like to run DataFusion in a Jupyter notebook and have an interface like this: -Run `jupyter lab` or open the [JupyterLab Desktop application](https://github.com/jupyterlab/jupyterlab-desktop) to start running DataFusion in a Jupyter notebook. +![DataFusion in Jupyter](https://github.com/MrPowers/datafusion-book/raw/main/src/images/datafusion-jupyterlab.png) + +Create a virtual environment with DataFusion, Jupyter, and other useful dependencies and start the desktop application. 
+ +```bash +uv venv +uv pip install datafusion jupyterlab jupyterlab_code_formatter +uv run jupyter lab +``` ## Examples diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index f5cece78e..000000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,26 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -sphinx -pydata-sphinx-theme==0.8.0 -myst-parser -maturin -jinja2 -ipython -pandas -pickleshare -sphinx-autoapi diff --git a/docs/source/contributor-guide/introduction.rst b/docs/source/contributor-guide/introduction.rst index 4457a898f..fb98cfd1d 100644 --- a/docs/source/contributor-guide/introduction.rst +++ b/docs/source/contributor-guide/introduction.rst @@ -29,22 +29,24 @@ Doing so is a great way to help the community as well as get more familiar with How to develop -------------- -This assumes that you have rust and cargo installed. We use the workflow recommended by `pyo3 `_ and `maturin `_. +This assumes that you have rust and cargo installed. We use the workflow recommended by +`pyo3 `_ and `maturin `_. We recommend using +`uv `_ for python package management. + +By default `uv` will attempt to build the datafusion python package. For our development we prefer to build manually. 
This means +that when creating your virtual environment using `uv sync` you need to pass in the additional `--no-install-package datafusion` +and for `uv run` commands the additional parameter `--no-project`. Bootstrap: .. code-block:: shell # fetch this repo - git clone git@github.com:apache/arrow-datafusion-python.git - # prepare development environment (used to build wheel / install in development) - python3 -m venv venv - # activate the venv - source venv/bin/activate - # update pip itself if necessary - python -m pip install -U pip - # install dependencies (for Python 3.8+) - python -m pip install -r requirements-310.txt + git clone git@github.com:apache/datafusion-python.git + # create the virtual environment + uv sync --dev --no-install-package datafusion + # activate the environment + source .venv/bin/activate The tests rely on test data in git submodules. @@ -58,8 +60,8 @@ Whenever rust code changes (your changes or via `git pull`): .. code-block:: shell - # make sure you activate the venv using "source venv/bin/activate" first - maturin develop + # make sure you activate the venv using "source .venv/bin/activate" first + maturin develop --uv python -m pytest Running & Installing pre-commit hooks @@ -86,20 +88,10 @@ Mostly, the ``python`` code is limited to pure wrappers with type hints and good Update Dependencies ------------------- -To change test dependencies, change the `requirements.in` and run - -.. code-block:: shell - - # install pip-tools (this can be done only once), also consider running in venv - python -m pip install pip-tools - python -m piptools compile --generate-hashes -o requirements-310.txt +To change test dependencies, edit ``pyproject.toml``. - -To update dependencies, run with `-U` +To update dependencies, run ..
code-block:: shell - python -m piptools compile -U --generate-hashes -o requirements-310.txt - - -More details about pip-tools `here `_ + uv sync --dev --no-install-package datafusion diff --git a/pyproject.toml b/pyproject.toml index 98bda5aae..6e8acfe71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ # under the License. [build-system] -requires = ["maturin>=1.5.1,<1.6.0"] +requires = ["maturin>=1.8.1"] build-backend = "maturin" [project] @@ -24,7 +24,7 @@ name = "datafusion" description = "Build and run queries against data" readme = "README.md" license = { file = "LICENSE.txt" } -requires-python = ">=3.7" +requires-python = ">=3.8" keywords = ["datafusion", "dataframe", "rust", "query-engine"] classifiers = [ "Development Status :: 2 - Pre-Alpha", @@ -35,7 +35,6 @@ classifiers = [ "Operating System :: Microsoft :: Windows", "Operating System :: POSIX :: Linux", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", @@ -82,3 +81,23 @@ max-doc-length = 88 "dev/*" = ["D"] "benchmarks/*" = ["D", "F"] "docs/*" = ["D"] + +[dependency-groups] +dev = [ + "maturin>=1.8.1", + "numpy>1.24.4 ; python_full_version >= '3.10'", + "pytest>=7.4.4", + "ruff>=0.9.1", + "toml>=0.10.2", +] +docs = [ + "sphinx>=7.1.2", + "pydata-sphinx-theme==0.8.0", + "myst-parser>=3.0.1", + "jinja2>=3.1.5", + "ipython>=8.12.3", + "pandas>=2.0.3", + "pickleshare>=0.7.5", + "sphinx-autoapi>=3.4.0", + "setuptools>=75.3.0", +] \ No newline at end of file diff --git a/requirements-310.txt b/requirements-310.txt deleted file mode 100644 index d7d25f3f1..000000000 --- a/requirements-310.txt +++ /dev/null @@ -1,195 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.10 -# by the following command: -# -# pip-compile --generate-hashes --output-file=requirements-310.txt -# -exceptiongroup==1.2.1 \ - 
--hash=sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad \ - --hash=sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16 - # via pytest -iniconfig==2.0.0 \ - --hash=sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3 \ - --hash=sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374 - # via pytest -maturin==1.6.0 \ - --hash=sha256:16ef860df20028618b5a064da06b02c1c47acba064a4d25aaf84662a459ec599 \ - --hash=sha256:337899784955934dd67b30497d1dd5fab22da89f60bb079dbaf2eaa446b97a10 \ - --hash=sha256:4e931c92037128ade49cd26dd040d9c46ad8092d8170cc44f5c3a0b4a052d576 \ - --hash=sha256:50133965e52d8b5b969381fee3fde111ae2383905cdaba7650f256e08ccddcd4 \ - --hash=sha256:a2a2436628c36d98dabd79b52256df7e12fc4fd1b122984d9373fdf918fd4609 \ - --hash=sha256:aa4eb7dca7d246b466392f21016f67ff09a9aff2305fa714ca25a2344e4639e7 \ - --hash=sha256:b955025c24c8babc808db49e0ff90db8b4b1320dcc16b14eb26132841737230d \ - --hash=sha256:bd85edcb1b8e2bcddc1b7d16ce58ce00a66aa80c422745c8ad9e132ac40d4b48 \ - --hash=sha256:c87d1a7596c42b589099adb831343a56e02373588366e4cede96cbdf8bd68f9d \ - --hash=sha256:d67ca8dc7f3b2314bd3bf83c4de52645e220ee312fd526e53acc6a735f233fad \ - --hash=sha256:d8620970bd0b6a0acb99dbd0b1c2ebb7a69909d25f6023bdff9635a39001aa51 \ - --hash=sha256:d92b045e90ed919a8a2520dda64e3f384e5e746ea51e1498cc6ac3e9e5c76054 \ - --hash=sha256:dbbbf25dc3c207b0a7bd4f3aea1df33d4f22b8508592796a6f36f4d8ed216db0 - # via -r requirements.in -mypy==1.10.0 \ - --hash=sha256:075cbf81f3e134eadaf247de187bd604748171d6b79736fa9b6c9685b4083061 \ - --hash=sha256:12b6bfc1b1a66095ab413160a6e520e1dc076a28f3e22f7fb25ba3b000b4ef99 \ - --hash=sha256:1ec404a7cbe9fc0e92cb0e67f55ce0c025014e26d33e54d9e506a0f2d07fe5de \ - --hash=sha256:28d0e038361b45f099cc086d9dd99c15ff14d0188f44ac883010e172ce86c38a \ - --hash=sha256:2b0695d605ddcd3eb2f736cd8b4e388288c21e7de85001e9f85df9187f2b50f9 \ - 
--hash=sha256:3236a4c8f535a0631f85f5fcdffba71c7feeef76a6002fcba7c1a8e57c8be1ec \ - --hash=sha256:3be66771aa5c97602f382230165b856c231d1277c511c9a8dd058be4784472e1 \ - --hash=sha256:3d087fcbec056c4ee34974da493a826ce316947485cef3901f511848e687c131 \ - --hash=sha256:3f298531bca95ff615b6e9f2fc0333aae27fa48052903a0ac90215021cdcfa4f \ - --hash=sha256:4a2b5cdbb5dd35aa08ea9114436e0d79aceb2f38e32c21684dcf8e24e1e92821 \ - --hash=sha256:4cf18f9d0efa1b16478c4c129eabec36148032575391095f73cae2e722fcf9d5 \ - --hash=sha256:8b2cbaca148d0754a54d44121b5825ae71868c7592a53b7292eeb0f3fdae95ee \ - --hash=sha256:8f55583b12156c399dce2df7d16f8a5095291354f1e839c252ec6c0611e86e2e \ - --hash=sha256:92f93b21c0fe73dc00abf91022234c79d793318b8a96faac147cd579c1671746 \ - --hash=sha256:9e36fb078cce9904c7989b9693e41cb9711e0600139ce3970c6ef814b6ebc2b2 \ - --hash=sha256:9fd50226364cd2737351c79807775136b0abe084433b55b2e29181a4c3c878c0 \ - --hash=sha256:a781f6ad4bab20eef8b65174a57e5203f4be627b46291f4589879bf4e257b97b \ - --hash=sha256:a87dbfa85971e8d59c9cc1fcf534efe664d8949e4c0b6b44e8ca548e746a8d53 \ - --hash=sha256:b808e12113505b97d9023b0b5e0c0705a90571c6feefc6f215c1df9381256e30 \ - --hash=sha256:bc6ac273b23c6b82da3bb25f4136c4fd42665f17f2cd850771cb600bdd2ebeda \ - --hash=sha256:cd777b780312ddb135bceb9bc8722a73ec95e042f911cc279e2ec3c667076051 \ - --hash=sha256:da1cbf08fb3b851ab3b9523a884c232774008267b1f83371ace57f412fe308c2 \ - --hash=sha256:e22e1527dc3d4aa94311d246b59e47f6455b8729f4968765ac1eacf9a4760bc7 \ - --hash=sha256:f8c083976eb530019175aabadb60921e73b4f45736760826aa1689dda8208aee \ - --hash=sha256:f90cff89eea89273727d8783fef5d4a934be2fdca11b47def50cf5d311aff727 \ - --hash=sha256:fa7ef5244615a2523b56c034becde4e9e3f9b034854c93639adb667ec9ec2976 \ - --hash=sha256:fcfc70599efde5c67862a07a1aaf50e55bce629ace26bb19dc17cece5dd31ca4 - # via -r requirements.in -mypy-extensions==1.0.0 \ - --hash=sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d \ - 
--hash=sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782 - # via mypy -numpy==2.0.0 \ - --hash=sha256:04494f6ec467ccb5369d1808570ae55f6ed9b5809d7f035059000a37b8d7e86f \ - --hash=sha256:0a43f0974d501842866cc83471bdb0116ba0dffdbaac33ec05e6afed5b615238 \ - --hash=sha256:0e50842b2295ba8414c8c1d9d957083d5dfe9e16828b37de883f51fc53c4016f \ - --hash=sha256:0ec84b9ba0654f3b962802edc91424331f423dcf5d5f926676e0150789cb3d95 \ - --hash=sha256:17067d097ed036636fa79f6a869ac26df7db1ba22039d962422506640314933a \ - --hash=sha256:1cde1753efe513705a0c6d28f5884e22bdc30438bf0085c5c486cdaff40cd67a \ - --hash=sha256:1e72728e7501a450288fc8e1f9ebc73d90cfd4671ebbd631f3e7857c39bd16f2 \ - --hash=sha256:2635dbd200c2d6faf2ef9a0d04f0ecc6b13b3cad54f7c67c61155138835515d2 \ - --hash=sha256:2ce46fd0b8a0c947ae047d222f7136fc4d55538741373107574271bc00e20e8f \ - --hash=sha256:34f003cb88b1ba38cb9a9a4a3161c1604973d7f9d5552c38bc2f04f829536609 \ - --hash=sha256:354f373279768fa5a584bac997de6a6c9bc535c482592d7a813bb0c09be6c76f \ - --hash=sha256:38ecb5b0582cd125f67a629072fed6f83562d9dd04d7e03256c9829bdec027ad \ - --hash=sha256:3e8e01233d57639b2e30966c63d36fcea099d17c53bf424d77f088b0f4babd86 \ - --hash=sha256:3f6bed7f840d44c08ebdb73b1825282b801799e325bcbdfa6bc5c370e5aecc65 \ - --hash=sha256:4554eb96f0fd263041baf16cf0881b3f5dafae7a59b1049acb9540c4d57bc8cb \ - --hash=sha256:46e161722e0f619749d1cd892167039015b2c2817296104487cd03ed4a955995 \ - --hash=sha256:49d9f7d256fbc804391a7f72d4a617302b1afac1112fac19b6c6cec63fe7fe8a \ - --hash=sha256:4d2f62e55a4cd9c58c1d9a1c9edaedcd857a73cb6fda875bf79093f9d9086f85 \ - --hash=sha256:5f64641b42b2429f56ee08b4f427a4d2daf916ec59686061de751a55aafa22e4 \ - --hash=sha256:63b92c512d9dbcc37f9d81b123dec99fdb318ba38c8059afc78086fe73820275 \ - --hash=sha256:6d7696c615765091cc5093f76fd1fa069870304beaccfd58b5dcc69e55ef49c1 \ - --hash=sha256:79e843d186c8fb1b102bef3e2bc35ef81160ffef3194646a7fdd6a73c6b97196 \ - 
--hash=sha256:821eedb7165ead9eebdb569986968b541f9908979c2da8a4967ecac4439bae3d \ - --hash=sha256:84554fc53daa8f6abf8e8a66e076aff6ece62de68523d9f665f32d2fc50fd66e \ - --hash=sha256:8d83bb187fb647643bd56e1ae43f273c7f4dbcdf94550d7938cfc32566756514 \ - --hash=sha256:903703372d46bce88b6920a0cd86c3ad82dae2dbef157b5fc01b70ea1cfc430f \ - --hash=sha256:9416a5c2e92ace094e9f0082c5fd473502c91651fb896bc17690d6fc475128d6 \ - --hash=sha256:9a1712c015831da583b21c5bfe15e8684137097969c6d22e8316ba66b5baabe4 \ - --hash=sha256:9c27f0946a3536403efb0e1c28def1ae6730a72cd0d5878db38824855e3afc44 \ - --hash=sha256:a356364941fb0593bb899a1076b92dfa2029f6f5b8ba88a14fd0984aaf76d0df \ - --hash=sha256:a7039a136017eaa92c1848152827e1424701532ca8e8967fe480fe1569dae581 \ - --hash=sha256:acd3a644e4807e73b4e1867b769fbf1ce8c5d80e7caaef0d90dcdc640dfc9787 \ - --hash=sha256:ad0c86f3455fbd0de6c31a3056eb822fc939f81b1618f10ff3406971893b62a5 \ - --hash=sha256:b4c76e3d4c56f145d41b7b6751255feefae92edbc9a61e1758a98204200f30fc \ - --hash=sha256:b6f6a8f45d0313db07d6d1d37bd0b112f887e1369758a5419c0370ba915b3871 \ - --hash=sha256:c5a59996dc61835133b56a32ebe4ef3740ea5bc19b3983ac60cc32be5a665d54 \ - --hash=sha256:c73aafd1afca80afecb22718f8700b40ac7cab927b8abab3c3e337d70e10e5a2 \ - --hash=sha256:cee6cc0584f71adefe2c908856ccc98702baf95ff80092e4ca46061538a2ba98 \ - --hash=sha256:cef04d068f5fb0518a77857953193b6bb94809a806bd0a14983a8f12ada060c9 \ - --hash=sha256:cf5d1c9e6837f8af9f92b6bd3e86d513cdc11f60fd62185cc49ec7d1aba34864 \ - --hash=sha256:e61155fae27570692ad1d327e81c6cf27d535a5d7ef97648a17d922224b216de \ - --hash=sha256:e7f387600d424f91576af20518334df3d97bc76a300a755f9a8d6e4f5cadd289 \ - --hash=sha256:ed08d2703b5972ec736451b818c2eb9da80d66c3e84aed1deeb0c345fefe461b \ - --hash=sha256:fbd6acc766814ea6443628f4e6751d0da6593dae29c08c0b2606164db026970c \ - --hash=sha256:feff59f27338135776f6d4e2ec7aeeac5d5f7a08a83e80869121ef8164b74af9 - # via - # -r requirements.in - # pyarrow -packaging==24.1 \ - 
--hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ - --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 - # via pytest -pluggy==1.5.0 \ - --hash=sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1 \ - --hash=sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669 - # via pytest -pyarrow==16.1.0 \ - --hash=sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a \ - --hash=sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2 \ - --hash=sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f \ - --hash=sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2 \ - --hash=sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315 \ - --hash=sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9 \ - --hash=sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b \ - --hash=sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55 \ - --hash=sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15 \ - --hash=sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e \ - --hash=sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f \ - --hash=sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c \ - --hash=sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a \ - --hash=sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa \ - --hash=sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a \ - --hash=sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd \ - --hash=sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628 \ - --hash=sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef \ - --hash=sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e \ - 
--hash=sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff \ - --hash=sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b \ - --hash=sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c \ - --hash=sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c \ - --hash=sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f \ - --hash=sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3 \ - --hash=sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6 \ - --hash=sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c \ - --hash=sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147 \ - --hash=sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5 \ - --hash=sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7 \ - --hash=sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710 \ - --hash=sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4 \ - --hash=sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed \ - --hash=sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848 \ - --hash=sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83 \ - --hash=sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444 - # via -r requirements.in -pytest==8.2.2 \ - --hash=sha256:c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343 \ - --hash=sha256:de4bb8104e201939ccdc688b27a89a7be2079b22e2bd2b07f806b6ba71117977 - # via -r requirements.in -ruff==0.4.9 \ - --hash=sha256:06b60f91bfa5514bb689b500a25ba48e897d18fea14dce14b48a0c40d1635893 \ - --hash=sha256:0e8e7b95673f22e0efd3571fb5b0cf71a5eaaa3cc8a776584f3b2cc878e46bff \ - --hash=sha256:2d45ddc6d82e1190ea737341326ecbc9a61447ba331b0a8962869fcada758505 \ - --hash=sha256:4555056049d46d8a381f746680db1c46e67ac3b00d714606304077682832998e \ - 
--hash=sha256:5d5460f789ccf4efd43f265a58538a2c24dbce15dbf560676e430375f20a8198 \ - --hash=sha256:673bddb893f21ab47a8334c8e0ea7fd6598ecc8e698da75bcd12a7b9d0a3206e \ - --hash=sha256:732dd550bfa5d85af8c3c6cbc47ba5b67c6aed8a89e2f011b908fc88f87649db \ - --hash=sha256:784d3ec9bd6493c3b720a0b76f741e6c2d7d44f6b2be87f5eef1ae8cc1d54c84 \ - --hash=sha256:78de3fdb95c4af084087628132336772b1c5044f6e710739d440fc0bccf4d321 \ - --hash=sha256:8064590fd1a50dcf4909c268b0e7c2498253273309ad3d97e4a752bb9df4f521 \ - --hash=sha256:88bffe9c6a454bf8529f9ab9091c99490578a593cc9f9822b7fc065ee0712a06 \ - --hash=sha256:8c1aff58c31948cc66d0b22951aa19edb5af0a3af40c936340cd32a8b1ab7438 \ - --hash=sha256:98ec2775fd2d856dc405635e5ee4ff177920f2141b8e2d9eb5bd6efd50e80317 \ - --hash=sha256:b262ed08d036ebe162123170b35703aaf9daffecb698cd367a8d585157732991 \ - --hash=sha256:e0a22c4157e53d006530c902107c7f550b9233e9706313ab57b892d7197d8e52 \ - --hash=sha256:e91175fbe48f8a2174c9aad70438fe9cb0a5732c4159b2a10a3565fea2d94cde \ - --hash=sha256:f1cb0828ac9533ba0135d148d214e284711ede33640465e706772645483427e3 - # via -r requirements.in -toml==0.10.2 \ - --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ - --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f - # via -r requirements.in -tomli==2.0.1 \ - --hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \ - --hash=sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f - # via - # maturin - # mypy - # pytest -typing-extensions==4.12.2 \ - --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ - --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 - # via mypy diff --git a/requirements-311.txt b/requirements-311.txt deleted file mode 100644 index 35b91133c..000000000 --- a/requirements-311.txt +++ /dev/null @@ -1,175 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.11 -# by the following command: -# 
-# pip-compile --generate-hashes --output-file=requirements-311.txt -# -iniconfig==2.0.0 \ - --hash=sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3 \ - --hash=sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374 - # via pytest -maturin==1.6.0 \ - --hash=sha256:16ef860df20028618b5a064da06b02c1c47acba064a4d25aaf84662a459ec599 \ - --hash=sha256:337899784955934dd67b30497d1dd5fab22da89f60bb079dbaf2eaa446b97a10 \ - --hash=sha256:4e931c92037128ade49cd26dd040d9c46ad8092d8170cc44f5c3a0b4a052d576 \ - --hash=sha256:50133965e52d8b5b969381fee3fde111ae2383905cdaba7650f256e08ccddcd4 \ - --hash=sha256:a2a2436628c36d98dabd79b52256df7e12fc4fd1b122984d9373fdf918fd4609 \ - --hash=sha256:aa4eb7dca7d246b466392f21016f67ff09a9aff2305fa714ca25a2344e4639e7 \ - --hash=sha256:b955025c24c8babc808db49e0ff90db8b4b1320dcc16b14eb26132841737230d \ - --hash=sha256:bd85edcb1b8e2bcddc1b7d16ce58ce00a66aa80c422745c8ad9e132ac40d4b48 \ - --hash=sha256:c87d1a7596c42b589099adb831343a56e02373588366e4cede96cbdf8bd68f9d \ - --hash=sha256:d67ca8dc7f3b2314bd3bf83c4de52645e220ee312fd526e53acc6a735f233fad \ - --hash=sha256:d8620970bd0b6a0acb99dbd0b1c2ebb7a69909d25f6023bdff9635a39001aa51 \ - --hash=sha256:d92b045e90ed919a8a2520dda64e3f384e5e746ea51e1498cc6ac3e9e5c76054 \ - --hash=sha256:dbbbf25dc3c207b0a7bd4f3aea1df33d4f22b8508592796a6f36f4d8ed216db0 - # via -r requirements.in -mypy==1.10.0 \ - --hash=sha256:075cbf81f3e134eadaf247de187bd604748171d6b79736fa9b6c9685b4083061 \ - --hash=sha256:12b6bfc1b1a66095ab413160a6e520e1dc076a28f3e22f7fb25ba3b000b4ef99 \ - --hash=sha256:1ec404a7cbe9fc0e92cb0e67f55ce0c025014e26d33e54d9e506a0f2d07fe5de \ - --hash=sha256:28d0e038361b45f099cc086d9dd99c15ff14d0188f44ac883010e172ce86c38a \ - --hash=sha256:2b0695d605ddcd3eb2f736cd8b4e388288c21e7de85001e9f85df9187f2b50f9 \ - --hash=sha256:3236a4c8f535a0631f85f5fcdffba71c7feeef76a6002fcba7c1a8e57c8be1ec \ - --hash=sha256:3be66771aa5c97602f382230165b856c231d1277c511c9a8dd058be4784472e1 \ - 
--hash=sha256:3d087fcbec056c4ee34974da493a826ce316947485cef3901f511848e687c131 \ - --hash=sha256:3f298531bca95ff615b6e9f2fc0333aae27fa48052903a0ac90215021cdcfa4f \ - --hash=sha256:4a2b5cdbb5dd35aa08ea9114436e0d79aceb2f38e32c21684dcf8e24e1e92821 \ - --hash=sha256:4cf18f9d0efa1b16478c4c129eabec36148032575391095f73cae2e722fcf9d5 \ - --hash=sha256:8b2cbaca148d0754a54d44121b5825ae71868c7592a53b7292eeb0f3fdae95ee \ - --hash=sha256:8f55583b12156c399dce2df7d16f8a5095291354f1e839c252ec6c0611e86e2e \ - --hash=sha256:92f93b21c0fe73dc00abf91022234c79d793318b8a96faac147cd579c1671746 \ - --hash=sha256:9e36fb078cce9904c7989b9693e41cb9711e0600139ce3970c6ef814b6ebc2b2 \ - --hash=sha256:9fd50226364cd2737351c79807775136b0abe084433b55b2e29181a4c3c878c0 \ - --hash=sha256:a781f6ad4bab20eef8b65174a57e5203f4be627b46291f4589879bf4e257b97b \ - --hash=sha256:a87dbfa85971e8d59c9cc1fcf534efe664d8949e4c0b6b44e8ca548e746a8d53 \ - --hash=sha256:b808e12113505b97d9023b0b5e0c0705a90571c6feefc6f215c1df9381256e30 \ - --hash=sha256:bc6ac273b23c6b82da3bb25f4136c4fd42665f17f2cd850771cb600bdd2ebeda \ - --hash=sha256:cd777b780312ddb135bceb9bc8722a73ec95e042f911cc279e2ec3c667076051 \ - --hash=sha256:da1cbf08fb3b851ab3b9523a884c232774008267b1f83371ace57f412fe308c2 \ - --hash=sha256:e22e1527dc3d4aa94311d246b59e47f6455b8729f4968765ac1eacf9a4760bc7 \ - --hash=sha256:f8c083976eb530019175aabadb60921e73b4f45736760826aa1689dda8208aee \ - --hash=sha256:f90cff89eea89273727d8783fef5d4a934be2fdca11b47def50cf5d311aff727 \ - --hash=sha256:fa7ef5244615a2523b56c034becde4e9e3f9b034854c93639adb667ec9ec2976 \ - --hash=sha256:fcfc70599efde5c67862a07a1aaf50e55bce629ace26bb19dc17cece5dd31ca4 - # via -r requirements.in -mypy-extensions==1.0.0 \ - --hash=sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d \ - --hash=sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782 - # via mypy -numpy==1.26.4 \ - --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ - 
--hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \ - --hash=sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20 \ - --hash=sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0 \ - --hash=sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010 \ - --hash=sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a \ - --hash=sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea \ - --hash=sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c \ - --hash=sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71 \ - --hash=sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110 \ - --hash=sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be \ - --hash=sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a \ - --hash=sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a \ - --hash=sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5 \ - --hash=sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed \ - --hash=sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd \ - --hash=sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c \ - --hash=sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e \ - --hash=sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0 \ - --hash=sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c \ - --hash=sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a \ - --hash=sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b \ - --hash=sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0 \ - --hash=sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6 \ - --hash=sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2 \ - 
--hash=sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a \ - --hash=sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30 \ - --hash=sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218 \ - --hash=sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5 \ - --hash=sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07 \ - --hash=sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2 \ - --hash=sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4 \ - --hash=sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764 \ - --hash=sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef \ - --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ - --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f - # via - # -r requirements.in - # pyarrow -packaging==24.0 \ - --hash=sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5 \ - --hash=sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9 - # via pytest -pluggy==1.5.0 \ - --hash=sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1 \ - --hash=sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669 - # via pytest -pyarrow==16.1.0 \ - --hash=sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a \ - --hash=sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2 \ - --hash=sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f \ - --hash=sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2 \ - --hash=sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315 \ - --hash=sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9 \ - --hash=sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b \ - 
--hash=sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55 \ - --hash=sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15 \ - --hash=sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e \ - --hash=sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f \ - --hash=sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c \ - --hash=sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a \ - --hash=sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa \ - --hash=sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a \ - --hash=sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd \ - --hash=sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628 \ - --hash=sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef \ - --hash=sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e \ - --hash=sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff \ - --hash=sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b \ - --hash=sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c \ - --hash=sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c \ - --hash=sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f \ - --hash=sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3 \ - --hash=sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6 \ - --hash=sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c \ - --hash=sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147 \ - --hash=sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5 \ - --hash=sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7 \ - --hash=sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710 \ - 
--hash=sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4 \ - --hash=sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed \ - --hash=sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848 \ - --hash=sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83 \ - --hash=sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444 - # via -r requirements.in -pytest==8.2.2 \ - --hash=sha256:c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343 \ - --hash=sha256:de4bb8104e201939ccdc688b27a89a7be2079b22e2bd2b07f806b6ba71117977 - # via -r requirements.in -ruff==0.4.8 \ - --hash=sha256:14019a06dbe29b608f6b7cbcec300e3170a8d86efaddb7b23405cb7f7dcaf780 \ - --hash=sha256:16d717b1d57b2e2fd68bd0bf80fb43931b79d05a7131aa477d66fc40fbd86268 \ - --hash=sha256:284c2e3f3396fb05f5f803c9fffb53ebbe09a3ebe7dda2929ed8d73ded736deb \ - --hash=sha256:384154a1c3f4bf537bac69f33720957ee49ac8d484bfc91720cc94172026ceed \ - --hash=sha256:6d795d7639212c2dfd01991259460101c22aabf420d9b943f153ab9d9706e6a9 \ - --hash=sha256:6ea874950daca5697309d976c9afba830d3bf0ed66887481d6bca1673fc5b66a \ - --hash=sha256:704977a658131651a22b5ebeb28b717ef42ac6ee3b11e91dc87b633b5d83142b \ - --hash=sha256:72584676164e15a68a15778fd1b17c28a519e7a0622161eb2debdcdabdc71883 \ - --hash=sha256:7663a6d78f6adb0eab270fa9cf1ff2d28618ca3a652b60f2a234d92b9ec89066 \ - --hash=sha256:9678d5c9b43315f323af2233a04d747409d1e3aa6789620083a82d1066a35199 \ - --hash=sha256:a7354f921e3fbe04d2a62d46707e569f9315e1a613307f7311a935743c51a764 \ - --hash=sha256:aad360893e92486662ef3be0a339c5ca3c1b109e0134fcd37d534d4be9fb8de3 \ - --hash=sha256:d05f8d6f0c3cce5026cecd83b7a143dcad503045857bc49662f736437380ad45 \ - --hash=sha256:e14a3a095d07560a9d6769a72f781d73259655919d9b396c650fc98a8157555d \ - --hash=sha256:e9d5ce97cacc99878aa0d084c626a15cd21e6b3d53fd6f9112b7fc485918e1fa \ - --hash=sha256:eeceb78da8afb6de0ddada93112869852d04f1cd0f6b80fe464fd4e35c330913 \ - 
--hash=sha256:fc95aac2943ddf360376be9aa3107c8cf9640083940a8c5bd824be692d2216dc - # via -r requirements.in -toml==0.10.2 \ - --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ - --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f - # via -r requirements.in -typing-extensions==4.12.1 \ - --hash=sha256:6024b58b69089e5a89c347397254e35f1bf02a907728ec7fee9bf0fe837d203a \ - --hash=sha256:915f5e35ff76f56588223f15fdd5938f9a1cf9195c0de25130c627e4d597f6d1 - # via mypy diff --git a/requirements-312.txt b/requirements-312.txt deleted file mode 100644 index e4de5a5d2..000000000 --- a/requirements-312.txt +++ /dev/null @@ -1,184 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.12 -# by the following command: -# -# pip-compile --generate-hashes --output-file=requirements-312.txt -# -iniconfig==2.0.0 \ - --hash=sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3 \ - --hash=sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374 - # via pytest -maturin==1.6.0 \ - --hash=sha256:16ef860df20028618b5a064da06b02c1c47acba064a4d25aaf84662a459ec599 \ - --hash=sha256:337899784955934dd67b30497d1dd5fab22da89f60bb079dbaf2eaa446b97a10 \ - --hash=sha256:4e931c92037128ade49cd26dd040d9c46ad8092d8170cc44f5c3a0b4a052d576 \ - --hash=sha256:50133965e52d8b5b969381fee3fde111ae2383905cdaba7650f256e08ccddcd4 \ - --hash=sha256:a2a2436628c36d98dabd79b52256df7e12fc4fd1b122984d9373fdf918fd4609 \ - --hash=sha256:aa4eb7dca7d246b466392f21016f67ff09a9aff2305fa714ca25a2344e4639e7 \ - --hash=sha256:b955025c24c8babc808db49e0ff90db8b4b1320dcc16b14eb26132841737230d \ - --hash=sha256:bd85edcb1b8e2bcddc1b7d16ce58ce00a66aa80c422745c8ad9e132ac40d4b48 \ - --hash=sha256:c87d1a7596c42b589099adb831343a56e02373588366e4cede96cbdf8bd68f9d \ - --hash=sha256:d67ca8dc7f3b2314bd3bf83c4de52645e220ee312fd526e53acc6a735f233fad \ - --hash=sha256:d8620970bd0b6a0acb99dbd0b1c2ebb7a69909d25f6023bdff9635a39001aa51 \ - 
--hash=sha256:d92b045e90ed919a8a2520dda64e3f384e5e746ea51e1498cc6ac3e9e5c76054 \ - --hash=sha256:dbbbf25dc3c207b0a7bd4f3aea1df33d4f22b8508592796a6f36f4d8ed216db0 - # via -r requirements.in -mypy==1.10.0 \ - --hash=sha256:075cbf81f3e134eadaf247de187bd604748171d6b79736fa9b6c9685b4083061 \ - --hash=sha256:12b6bfc1b1a66095ab413160a6e520e1dc076a28f3e22f7fb25ba3b000b4ef99 \ - --hash=sha256:1ec404a7cbe9fc0e92cb0e67f55ce0c025014e26d33e54d9e506a0f2d07fe5de \ - --hash=sha256:28d0e038361b45f099cc086d9dd99c15ff14d0188f44ac883010e172ce86c38a \ - --hash=sha256:2b0695d605ddcd3eb2f736cd8b4e388288c21e7de85001e9f85df9187f2b50f9 \ - --hash=sha256:3236a4c8f535a0631f85f5fcdffba71c7feeef76a6002fcba7c1a8e57c8be1ec \ - --hash=sha256:3be66771aa5c97602f382230165b856c231d1277c511c9a8dd058be4784472e1 \ - --hash=sha256:3d087fcbec056c4ee34974da493a826ce316947485cef3901f511848e687c131 \ - --hash=sha256:3f298531bca95ff615b6e9f2fc0333aae27fa48052903a0ac90215021cdcfa4f \ - --hash=sha256:4a2b5cdbb5dd35aa08ea9114436e0d79aceb2f38e32c21684dcf8e24e1e92821 \ - --hash=sha256:4cf18f9d0efa1b16478c4c129eabec36148032575391095f73cae2e722fcf9d5 \ - --hash=sha256:8b2cbaca148d0754a54d44121b5825ae71868c7592a53b7292eeb0f3fdae95ee \ - --hash=sha256:8f55583b12156c399dce2df7d16f8a5095291354f1e839c252ec6c0611e86e2e \ - --hash=sha256:92f93b21c0fe73dc00abf91022234c79d793318b8a96faac147cd579c1671746 \ - --hash=sha256:9e36fb078cce9904c7989b9693e41cb9711e0600139ce3970c6ef814b6ebc2b2 \ - --hash=sha256:9fd50226364cd2737351c79807775136b0abe084433b55b2e29181a4c3c878c0 \ - --hash=sha256:a781f6ad4bab20eef8b65174a57e5203f4be627b46291f4589879bf4e257b97b \ - --hash=sha256:a87dbfa85971e8d59c9cc1fcf534efe664d8949e4c0b6b44e8ca548e746a8d53 \ - --hash=sha256:b808e12113505b97d9023b0b5e0c0705a90571c6feefc6f215c1df9381256e30 \ - --hash=sha256:bc6ac273b23c6b82da3bb25f4136c4fd42665f17f2cd850771cb600bdd2ebeda \ - --hash=sha256:cd777b780312ddb135bceb9bc8722a73ec95e042f911cc279e2ec3c667076051 \ - 
--hash=sha256:da1cbf08fb3b851ab3b9523a884c232774008267b1f83371ace57f412fe308c2 \ - --hash=sha256:e22e1527dc3d4aa94311d246b59e47f6455b8729f4968765ac1eacf9a4760bc7 \ - --hash=sha256:f8c083976eb530019175aabadb60921e73b4f45736760826aa1689dda8208aee \ - --hash=sha256:f90cff89eea89273727d8783fef5d4a934be2fdca11b47def50cf5d311aff727 \ - --hash=sha256:fa7ef5244615a2523b56c034becde4e9e3f9b034854c93639adb667ec9ec2976 \ - --hash=sha256:fcfc70599efde5c67862a07a1aaf50e55bce629ace26bb19dc17cece5dd31ca4 - # via -r requirements.in -mypy-extensions==1.0.0 \ - --hash=sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d \ - --hash=sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782 - # via mypy -numpy==2.0.0 \ - --hash=sha256:04494f6ec467ccb5369d1808570ae55f6ed9b5809d7f035059000a37b8d7e86f \ - --hash=sha256:0a43f0974d501842866cc83471bdb0116ba0dffdbaac33ec05e6afed5b615238 \ - --hash=sha256:0e50842b2295ba8414c8c1d9d957083d5dfe9e16828b37de883f51fc53c4016f \ - --hash=sha256:0ec84b9ba0654f3b962802edc91424331f423dcf5d5f926676e0150789cb3d95 \ - --hash=sha256:17067d097ed036636fa79f6a869ac26df7db1ba22039d962422506640314933a \ - --hash=sha256:1cde1753efe513705a0c6d28f5884e22bdc30438bf0085c5c486cdaff40cd67a \ - --hash=sha256:1e72728e7501a450288fc8e1f9ebc73d90cfd4671ebbd631f3e7857c39bd16f2 \ - --hash=sha256:2635dbd200c2d6faf2ef9a0d04f0ecc6b13b3cad54f7c67c61155138835515d2 \ - --hash=sha256:2ce46fd0b8a0c947ae047d222f7136fc4d55538741373107574271bc00e20e8f \ - --hash=sha256:34f003cb88b1ba38cb9a9a4a3161c1604973d7f9d5552c38bc2f04f829536609 \ - --hash=sha256:354f373279768fa5a584bac997de6a6c9bc535c482592d7a813bb0c09be6c76f \ - --hash=sha256:38ecb5b0582cd125f67a629072fed6f83562d9dd04d7e03256c9829bdec027ad \ - --hash=sha256:3e8e01233d57639b2e30966c63d36fcea099d17c53bf424d77f088b0f4babd86 \ - --hash=sha256:3f6bed7f840d44c08ebdb73b1825282b801799e325bcbdfa6bc5c370e5aecc65 \ - --hash=sha256:4554eb96f0fd263041baf16cf0881b3f5dafae7a59b1049acb9540c4d57bc8cb \ - 
--hash=sha256:46e161722e0f619749d1cd892167039015b2c2817296104487cd03ed4a955995 \ - --hash=sha256:49d9f7d256fbc804391a7f72d4a617302b1afac1112fac19b6c6cec63fe7fe8a \ - --hash=sha256:4d2f62e55a4cd9c58c1d9a1c9edaedcd857a73cb6fda875bf79093f9d9086f85 \ - --hash=sha256:5f64641b42b2429f56ee08b4f427a4d2daf916ec59686061de751a55aafa22e4 \ - --hash=sha256:63b92c512d9dbcc37f9d81b123dec99fdb318ba38c8059afc78086fe73820275 \ - --hash=sha256:6d7696c615765091cc5093f76fd1fa069870304beaccfd58b5dcc69e55ef49c1 \ - --hash=sha256:79e843d186c8fb1b102bef3e2bc35ef81160ffef3194646a7fdd6a73c6b97196 \ - --hash=sha256:821eedb7165ead9eebdb569986968b541f9908979c2da8a4967ecac4439bae3d \ - --hash=sha256:84554fc53daa8f6abf8e8a66e076aff6ece62de68523d9f665f32d2fc50fd66e \ - --hash=sha256:8d83bb187fb647643bd56e1ae43f273c7f4dbcdf94550d7938cfc32566756514 \ - --hash=sha256:903703372d46bce88b6920a0cd86c3ad82dae2dbef157b5fc01b70ea1cfc430f \ - --hash=sha256:9416a5c2e92ace094e9f0082c5fd473502c91651fb896bc17690d6fc475128d6 \ - --hash=sha256:9a1712c015831da583b21c5bfe15e8684137097969c6d22e8316ba66b5baabe4 \ - --hash=sha256:9c27f0946a3536403efb0e1c28def1ae6730a72cd0d5878db38824855e3afc44 \ - --hash=sha256:a356364941fb0593bb899a1076b92dfa2029f6f5b8ba88a14fd0984aaf76d0df \ - --hash=sha256:a7039a136017eaa92c1848152827e1424701532ca8e8967fe480fe1569dae581 \ - --hash=sha256:acd3a644e4807e73b4e1867b769fbf1ce8c5d80e7caaef0d90dcdc640dfc9787 \ - --hash=sha256:ad0c86f3455fbd0de6c31a3056eb822fc939f81b1618f10ff3406971893b62a5 \ - --hash=sha256:b4c76e3d4c56f145d41b7b6751255feefae92edbc9a61e1758a98204200f30fc \ - --hash=sha256:b6f6a8f45d0313db07d6d1d37bd0b112f887e1369758a5419c0370ba915b3871 \ - --hash=sha256:c5a59996dc61835133b56a32ebe4ef3740ea5bc19b3983ac60cc32be5a665d54 \ - --hash=sha256:c73aafd1afca80afecb22718f8700b40ac7cab927b8abab3c3e337d70e10e5a2 \ - --hash=sha256:cee6cc0584f71adefe2c908856ccc98702baf95ff80092e4ca46061538a2ba98 \ - --hash=sha256:cef04d068f5fb0518a77857953193b6bb94809a806bd0a14983a8f12ada060c9 \ - 
--hash=sha256:cf5d1c9e6837f8af9f92b6bd3e86d513cdc11f60fd62185cc49ec7d1aba34864 \ - --hash=sha256:e61155fae27570692ad1d327e81c6cf27d535a5d7ef97648a17d922224b216de \ - --hash=sha256:e7f387600d424f91576af20518334df3d97bc76a300a755f9a8d6e4f5cadd289 \ - --hash=sha256:ed08d2703b5972ec736451b818c2eb9da80d66c3e84aed1deeb0c345fefe461b \ - --hash=sha256:fbd6acc766814ea6443628f4e6751d0da6593dae29c08c0b2606164db026970c \ - --hash=sha256:feff59f27338135776f6d4e2ec7aeeac5d5f7a08a83e80869121ef8164b74af9 - # via - # -r requirements.in - # pyarrow -packaging==24.1 \ - --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ - --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 - # via pytest -pluggy==1.5.0 \ - --hash=sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1 \ - --hash=sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669 - # via pytest -pyarrow==16.1.0 \ - --hash=sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a \ - --hash=sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2 \ - --hash=sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f \ - --hash=sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2 \ - --hash=sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315 \ - --hash=sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9 \ - --hash=sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b \ - --hash=sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55 \ - --hash=sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15 \ - --hash=sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e \ - --hash=sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f \ - --hash=sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c \ - 
--hash=sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a \ - --hash=sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa \ - --hash=sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a \ - --hash=sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd \ - --hash=sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628 \ - --hash=sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef \ - --hash=sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e \ - --hash=sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff \ - --hash=sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b \ - --hash=sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c \ - --hash=sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c \ - --hash=sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f \ - --hash=sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3 \ - --hash=sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6 \ - --hash=sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c \ - --hash=sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147 \ - --hash=sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5 \ - --hash=sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7 \ - --hash=sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710 \ - --hash=sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4 \ - --hash=sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed \ - --hash=sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848 \ - --hash=sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83 \ - --hash=sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444 - # via -r 
requirements.in -pytest==8.2.2 \ - --hash=sha256:c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343 \ - --hash=sha256:de4bb8104e201939ccdc688b27a89a7be2079b22e2bd2b07f806b6ba71117977 - # via -r requirements.in -ruff==0.4.9 \ - --hash=sha256:06b60f91bfa5514bb689b500a25ba48e897d18fea14dce14b48a0c40d1635893 \ - --hash=sha256:0e8e7b95673f22e0efd3571fb5b0cf71a5eaaa3cc8a776584f3b2cc878e46bff \ - --hash=sha256:2d45ddc6d82e1190ea737341326ecbc9a61447ba331b0a8962869fcada758505 \ - --hash=sha256:4555056049d46d8a381f746680db1c46e67ac3b00d714606304077682832998e \ - --hash=sha256:5d5460f789ccf4efd43f265a58538a2c24dbce15dbf560676e430375f20a8198 \ - --hash=sha256:673bddb893f21ab47a8334c8e0ea7fd6598ecc8e698da75bcd12a7b9d0a3206e \ - --hash=sha256:732dd550bfa5d85af8c3c6cbc47ba5b67c6aed8a89e2f011b908fc88f87649db \ - --hash=sha256:784d3ec9bd6493c3b720a0b76f741e6c2d7d44f6b2be87f5eef1ae8cc1d54c84 \ - --hash=sha256:78de3fdb95c4af084087628132336772b1c5044f6e710739d440fc0bccf4d321 \ - --hash=sha256:8064590fd1a50dcf4909c268b0e7c2498253273309ad3d97e4a752bb9df4f521 \ - --hash=sha256:88bffe9c6a454bf8529f9ab9091c99490578a593cc9f9822b7fc065ee0712a06 \ - --hash=sha256:8c1aff58c31948cc66d0b22951aa19edb5af0a3af40c936340cd32a8b1ab7438 \ - --hash=sha256:98ec2775fd2d856dc405635e5ee4ff177920f2141b8e2d9eb5bd6efd50e80317 \ - --hash=sha256:b262ed08d036ebe162123170b35703aaf9daffecb698cd367a8d585157732991 \ - --hash=sha256:e0a22c4157e53d006530c902107c7f550b9233e9706313ab57b892d7197d8e52 \ - --hash=sha256:e91175fbe48f8a2174c9aad70438fe9cb0a5732c4159b2a10a3565fea2d94cde \ - --hash=sha256:f1cb0828ac9533ba0135d148d214e284711ede33640465e706772645483427e3 - # via -r requirements.in -toml==0.10.2 \ - --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ - --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f - # via -r requirements.in -typing-extensions==4.12.2 \ - --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ - 
--hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 - # via mypy diff --git a/requirements.in b/requirements.in deleted file mode 100644 index 1b7f62052..000000000 --- a/requirements.in +++ /dev/null @@ -1,26 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -maturin>=1.5.1 -mypy -numpy -pyarrow>=11.0.0 -pytest -ruff -toml -importlib_metadata; python_version < "3.8" -PyGitHub diff --git a/uv.lock b/uv.lock new file mode 100644 index 000000000..75d9ed018 --- /dev/null +++ b/uv.lock @@ -0,0 +1,1842 @@ +version = 1 +requires-python = ">=3.8" +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", + "python_full_version == '3.9.*'", + "python_full_version < '3.9'", +] + +[[package]] +name = "alabaster" +version = "0.7.13" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +sdist = { url = "https://files.pythonhosted.org/packages/94/71/a8ee96d1fd95ca04a0d2e2d9c4081dac4c2d2b12f7ddb899c8cb9bfd1532/alabaster-0.7.13.tar.gz", hash = "sha256:a27a4a084d5e690e16e01e03ad2b2e552c61a65469419b907243193de1a84ae2", size = 11454 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/64/88/c7083fc61120ab661c5d0b82cb77079fc1429d3f913a456c1c82cf4658f7/alabaster-0.7.13-py3-none-any.whl", hash = "sha256:1ee19aca801bbabb5ba3f5f258e4422dfa86f82f3e9cefb0859b283cdd7f62a3", size = 13857 }, +] + +[[package]] +name = "alabaster" +version = "0.7.16" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version == '3.9.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/c9/3e/13dd8e5ed9094e734ac430b5d0eb4f2bb001708a8b7856cbf8e084e001ba/alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65", size = 23776 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/34/d4e1c02d3bee589efb5dfa17f88ea08bdb3e3eac12bc475462aec52ed223/alabaster-0.7.16-py3-none-any.whl", hash = "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92", size = 13511 }, +] + +[[package]] +name = "alabaster" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + 
"python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/a6/f8/d9c74d0daf3f742840fd818d69cfae176fa332022fd44e3469487d5a9420/alabaster-1.0.0.tar.gz", hash = "sha256:c00dca57bca26fa62a6d7d0a9fcce65f3e026e9bfe33e9c538fd3fbb2144fd9e", size = 24210 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b", size = 13929 }, +] + +[[package]] +name = "appnope" +version = "0.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/35/5d/752690df9ef5b76e169e68d6a129fa6d08a7100ca7f754c89495db3c6019/appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee", size = 4170 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321 }, +] + +[[package]] +name = "astroid" +version = "3.2.4" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +dependencies = [ + { name = "typing-extensions", marker = "python_full_version < '3.9'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/53/1067e1113ecaf58312357f2cd93063674924119d80d173adc3f6f2387aa2/astroid-3.2.4.tar.gz", hash = "sha256:0e14202810b30da1b735827f78f5157be2bbd4a7a59b7707ca0bfc2fb4c0063a", size = 397576 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/80/96/b32bbbb46170a1c8b8b1f28c794202e25cfe743565e9d3469b8eb1e0cc05/astroid-3.2.4-py3-none-any.whl", hash = "sha256:413658a61eeca6202a59231abb473f932038fbcbf1666587f66d482083413a25", size = 276348 
}, +] + +[[package]] +name = "astroid" +version = "3.3.8" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", + "python_full_version == '3.9.*'", +] +dependencies = [ + { name = "typing-extensions", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/80/c5/5c83c48bbf547f3dd8b587529db7cf5a265a3368b33e85e76af8ff6061d3/astroid-3.3.8.tar.gz", hash = "sha256:a88c7994f914a4ea8572fac479459f4955eeccc877be3f2d959a33273b0cf40b", size = 398196 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/28/0bc8a17d6cd4cc3c79ae41b7105a2b9a327c110e5ddd37a8a27b29a5c8a2/astroid-3.3.8-py3-none-any.whl", hash = "sha256:187ccc0c248bfbba564826c26f070494f7bc964fd286b6d9fff4420e55de828c", size = 275153 }, +] + +[[package]] +name = "asttokens" +version = "3.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4a/e7/82da0a03e7ba5141f05cce0d302e6eed121ae055e0456ca228bf693984bc/asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7", size = 61978 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2", size = 26918 }, +] + +[[package]] +name = "babel" +version = "2.16.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytz", marker = "python_full_version < '3.9'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2a/74/f1bc80f23eeba13393b7222b11d95ca3af2c1e28edca18af487137eefed9/babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316", size = 9348104 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/ed/20/bc79bc575ba2e2a7f70e8a1155618bb1301eaa5132a8271373a6903f73f8/babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b", size = 9587599 }, +] + +[[package]] +name = "backcall" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/40/764a663805d84deee23043e1426a9175567db89c8b3287b5c2ad9f71aa93/backcall-0.2.0.tar.gz", hash = "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e", size = 18041 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4c/1c/ff6546b6c12603d8dd1070aa3c3d273ad4c07f5771689a7b69a550e8c951/backcall-0.2.0-py2.py3-none-any.whl", hash = "sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255", size = 11157 }, +] + +[[package]] +name = "beautifulsoup4" +version = "4.12.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "soupsieve" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/ca/824b1195773ce6166d388573fc106ce56d4a805bd7427b624e063596ec58/beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051", size = 581181 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/fe/e8c672695b37eecc5cbf43e1d0638d88d66ba3a44c4d321c796f4e59167f/beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed", size = 147925 }, +] + +[[package]] +name = "certifi" +version = "2024.12.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0f/bd/1d41ee578ce09523c81a15426705dd20969f5abf006d1afe8aeff0dd776a/certifi-2024.12.14.tar.gz", hash = "sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db", size = 166010 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/a5/32/8f6669fc4798494966bf446c8c4a162e0b5d893dff088afddf76414f70e1/certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56", size = 164927 }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/16/b0/572805e227f01586461c80e0fd25d65a2115599cc9dad142fee4b747c357/charset_normalizer-3.4.1.tar.gz", hash = "sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3", size = 123188 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0d/58/5580c1716040bc89206c77d8f74418caf82ce519aae06450393ca73475d1/charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de", size = 198013 }, + { url = "https://files.pythonhosted.org/packages/d0/11/00341177ae71c6f5159a08168bcb98c6e6d196d372c94511f9f6c9afe0c6/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176", size = 141285 }, + { url = "https://files.pythonhosted.org/packages/01/09/11d684ea5819e5a8f5100fb0b38cf8d02b514746607934134d31233e02c8/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e218488cd232553829be0664c2292d3af2eeeb94b32bea483cf79ac6a694e037", size = 151449 }, + { url = "https://files.pythonhosted.org/packages/08/06/9f5a12939db324d905dc1f70591ae7d7898d030d7662f0d426e2286f68c9/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80ed5e856eb7f30115aaf94e4a08114ccc8813e6ed1b5efa74f9f82e8509858f", size = 143892 }, + { url = 
"https://files.pythonhosted.org/packages/93/62/5e89cdfe04584cb7f4d36003ffa2936681b03ecc0754f8e969c2becb7e24/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b010a7a4fd316c3c484d482922d13044979e78d1861f0e0650423144c616a46a", size = 146123 }, + { url = "https://files.pythonhosted.org/packages/a9/ac/ab729a15c516da2ab70a05f8722ecfccc3f04ed7a18e45c75bbbaa347d61/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4532bff1b8421fd0a320463030c7520f56a79c9024a4e88f01c537316019005a", size = 147943 }, + { url = "https://files.pythonhosted.org/packages/03/d2/3f392f23f042615689456e9a274640c1d2e5dd1d52de36ab8f7955f8f050/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d973f03c0cb71c5ed99037b870f2be986c3c05e63622c017ea9816881d2dd247", size = 142063 }, + { url = "https://files.pythonhosted.org/packages/f2/e3/e20aae5e1039a2cd9b08d9205f52142329f887f8cf70da3650326670bddf/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3a3bd0dcd373514dcec91c411ddb9632c0d7d92aed7093b8c3bbb6d69ca74408", size = 150578 }, + { url = "https://files.pythonhosted.org/packages/8d/af/779ad72a4da0aed925e1139d458adc486e61076d7ecdcc09e610ea8678db/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:d9c3cdf5390dcd29aa8056d13e8e99526cda0305acc038b96b30352aff5ff2bb", size = 153629 }, + { url = "https://files.pythonhosted.org/packages/c2/b6/7aa450b278e7aa92cf7732140bfd8be21f5f29d5bf334ae987c945276639/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:2bdfe3ac2e1bbe5b59a1a63721eb3b95fc9b6817ae4a46debbb4e11f6232428d", size = 150778 }, + { url = "https://files.pythonhosted.org/packages/39/f4/d9f4f712d0951dcbfd42920d3db81b00dd23b6ab520419626f4023334056/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = 
"sha256:eab677309cdb30d047996b36d34caeda1dc91149e4fdca0b1a039b3f79d9a807", size = 146453 }, + { url = "https://files.pythonhosted.org/packages/49/2b/999d0314e4ee0cff3cb83e6bc9aeddd397eeed693edb4facb901eb8fbb69/charset_normalizer-3.4.1-cp310-cp310-win32.whl", hash = "sha256:c0429126cf75e16c4f0ad00ee0eae4242dc652290f940152ca8c75c3a4b6ee8f", size = 95479 }, + { url = "https://files.pythonhosted.org/packages/2d/ce/3cbed41cff67e455a386fb5e5dd8906cdda2ed92fbc6297921f2e4419309/charset_normalizer-3.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:9f0b8b1c6d84c8034a44893aba5e767bf9c7a211e313a9605d9c617d7083829f", size = 102790 }, + { url = "https://files.pythonhosted.org/packages/72/80/41ef5d5a7935d2d3a773e3eaebf0a9350542f2cab4eac59a7a4741fbbbbe/charset_normalizer-3.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8bfa33f4f2672964266e940dd22a195989ba31669bd84629f05fab3ef4e2d125", size = 194995 }, + { url = "https://files.pythonhosted.org/packages/7a/28/0b9fefa7b8b080ec492110af6d88aa3dea91c464b17d53474b6e9ba5d2c5/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28bf57629c75e810b6ae989f03c0828d64d6b26a5e205535585f96093e405ed1", size = 139471 }, + { url = "https://files.pythonhosted.org/packages/71/64/d24ab1a997efb06402e3fc07317e94da358e2585165930d9d59ad45fcae2/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f08ff5e948271dc7e18a35641d2f11a4cd8dfd5634f55228b691e62b37125eb3", size = 149831 }, + { url = "https://files.pythonhosted.org/packages/37/ed/be39e5258e198655240db5e19e0b11379163ad7070962d6b0c87ed2c4d39/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:234ac59ea147c59ee4da87a0c0f098e9c8d169f4dc2a159ef720f1a61bbe27cd", size = 142335 }, + { url = 
"https://files.pythonhosted.org/packages/88/83/489e9504711fa05d8dde1574996408026bdbdbd938f23be67deebb5eca92/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd4ec41f914fa74ad1b8304bbc634b3de73d2a0889bd32076342a573e0779e00", size = 143862 }, + { url = "https://files.pythonhosted.org/packages/c6/c7/32da20821cf387b759ad24627a9aca289d2822de929b8a41b6241767b461/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eea6ee1db730b3483adf394ea72f808b6e18cf3cb6454b4d86e04fa8c4327a12", size = 145673 }, + { url = "https://files.pythonhosted.org/packages/68/85/f4288e96039abdd5aeb5c546fa20a37b50da71b5cf01e75e87f16cd43304/charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c96836c97b1238e9c9e3fe90844c947d5afbf4f4c92762679acfe19927d81d77", size = 140211 }, + { url = "https://files.pythonhosted.org/packages/28/a3/a42e70d03cbdabc18997baf4f0227c73591a08041c149e710045c281f97b/charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4d86f7aff21ee58f26dcf5ae81a9addbd914115cdebcbb2217e4f0ed8982e146", size = 148039 }, + { url = "https://files.pythonhosted.org/packages/85/e4/65699e8ab3014ecbe6f5c71d1a55d810fb716bbfd74f6283d5c2aa87febf/charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:09b5e6733cbd160dcc09589227187e242a30a49ca5cefa5a7edd3f9d19ed53fd", size = 151939 }, + { url = "https://files.pythonhosted.org/packages/b1/82/8e9fe624cc5374193de6860aba3ea8070f584c8565ee77c168ec13274bd2/charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:5777ee0881f9499ed0f71cc82cf873d9a0ca8af166dfa0af8ec4e675b7df48e6", size = 149075 }, + { url = "https://files.pythonhosted.org/packages/3d/7b/82865ba54c765560c8433f65e8acb9217cb839a9e32b42af4aa8e945870f/charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:237bdbe6159cff53b4f24f397d43c6336c6b0b42affbe857970cefbb620911c8", size = 144340 }, + { url = "https://files.pythonhosted.org/packages/b5/b6/9674a4b7d4d99a0d2df9b215da766ee682718f88055751e1e5e753c82db0/charset_normalizer-3.4.1-cp311-cp311-win32.whl", hash = "sha256:8417cb1f36cc0bc7eaba8ccb0e04d55f0ee52df06df3ad55259b9a323555fc8b", size = 95205 }, + { url = "https://files.pythonhosted.org/packages/1e/ab/45b180e175de4402dcf7547e4fb617283bae54ce35c27930a6f35b6bef15/charset_normalizer-3.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:d7f50a1f8c450f3925cb367d011448c39239bb3eb4117c36a6d354794de4ce76", size = 102441 }, + { url = "https://files.pythonhosted.org/packages/0a/9a/dd1e1cdceb841925b7798369a09279bd1cf183cef0f9ddf15a3a6502ee45/charset_normalizer-3.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:73d94b58ec7fecbc7366247d3b0b10a21681004153238750bb67bd9012414545", size = 196105 }, + { url = "https://files.pythonhosted.org/packages/d3/8c/90bfabf8c4809ecb648f39794cf2a84ff2e7d2a6cf159fe68d9a26160467/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad3e487649f498dd991eeb901125411559b22e8d7ab25d3aeb1af367df5efd7", size = 140404 }, + { url = "https://files.pythonhosted.org/packages/ad/8f/e410d57c721945ea3b4f1a04b74f70ce8fa800d393d72899f0a40526401f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c30197aa96e8eed02200a83fba2657b4c3acd0f0aa4bdc9f6c1af8e8962e0757", size = 150423 }, + { url = "https://files.pythonhosted.org/packages/f0/b8/e6825e25deb691ff98cf5c9072ee0605dc2acfca98af70c2d1b1bc75190d/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2369eea1ee4a7610a860d88f268eb39b95cb588acd7235e02fd5a5601773d4fa", size = 143184 }, + { url = 
"https://files.pythonhosted.org/packages/3e/a2/513f6cbe752421f16d969e32f3583762bfd583848b763913ddab8d9bfd4f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc2722592d8998c870fa4e290c2eec2c1569b87fe58618e67d38b4665dfa680d", size = 145268 }, + { url = "https://files.pythonhosted.org/packages/74/94/8a5277664f27c3c438546f3eb53b33f5b19568eb7424736bdc440a88a31f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffc9202a29ab3920fa812879e95a9e78b2465fd10be7fcbd042899695d75e616", size = 147601 }, + { url = "https://files.pythonhosted.org/packages/7c/5f/6d352c51ee763623a98e31194823518e09bfa48be2a7e8383cf691bbb3d0/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:804a4d582ba6e5b747c625bf1255e6b1507465494a40a2130978bda7b932c90b", size = 141098 }, + { url = "https://files.pythonhosted.org/packages/78/d4/f5704cb629ba5ab16d1d3d741396aec6dc3ca2b67757c45b0599bb010478/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0f55e69f030f7163dffe9fd0752b32f070566451afe180f99dbeeb81f511ad8d", size = 149520 }, + { url = "https://files.pythonhosted.org/packages/c5/96/64120b1d02b81785f222b976c0fb79a35875457fa9bb40827678e54d1bc8/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c4c3e6da02df6fa1410a7680bd3f63d4f710232d3139089536310d027950696a", size = 152852 }, + { url = "https://files.pythonhosted.org/packages/84/c9/98e3732278a99f47d487fd3468bc60b882920cef29d1fa6ca460a1fdf4e6/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:5df196eb874dae23dcfb968c83d4f8fdccb333330fe1fc278ac5ceeb101003a9", size = 150488 }, + { url = "https://files.pythonhosted.org/packages/13/0e/9c8d4cb99c98c1007cc11eda969ebfe837bbbd0acdb4736d228ccaabcd22/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:e358e64305fe12299a08e08978f51fc21fac060dcfcddd95453eabe5b93ed0e1", size = 146192 }, + { url = "https://files.pythonhosted.org/packages/b2/21/2b6b5b860781a0b49427309cb8670785aa543fb2178de875b87b9cc97746/charset_normalizer-3.4.1-cp312-cp312-win32.whl", hash = "sha256:9b23ca7ef998bc739bf6ffc077c2116917eabcc901f88da1b9856b210ef63f35", size = 95550 }, + { url = "https://files.pythonhosted.org/packages/21/5b/1b390b03b1d16c7e382b561c5329f83cc06623916aab983e8ab9239c7d5c/charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:6ff8a4a60c227ad87030d76e99cd1698345d4491638dfa6673027c48b3cd395f", size = 102785 }, + { url = "https://files.pythonhosted.org/packages/38/94/ce8e6f63d18049672c76d07d119304e1e2d7c6098f0841b51c666e9f44a0/charset_normalizer-3.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:aabfa34badd18f1da5ec1bc2715cadc8dca465868a4e73a0173466b688f29dda", size = 195698 }, + { url = "https://files.pythonhosted.org/packages/24/2e/dfdd9770664aae179a96561cc6952ff08f9a8cd09a908f259a9dfa063568/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22e14b5d70560b8dd51ec22863f370d1e595ac3d024cb8ad7d308b4cd95f8313", size = 140162 }, + { url = "https://files.pythonhosted.org/packages/24/4e/f646b9093cff8fc86f2d60af2de4dc17c759de9d554f130b140ea4738ca6/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8436c508b408b82d87dc5f62496973a1805cd46727c34440b0d29d8a2f50a6c9", size = 150263 }, + { url = "https://files.pythonhosted.org/packages/5e/67/2937f8d548c3ef6e2f9aab0f6e21001056f692d43282b165e7c56023e6dd/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d074908e1aecee37a7635990b2c6d504cd4766c7bc9fc86d63f9c09af3fa11b", size = 142966 }, + { url = 
"https://files.pythonhosted.org/packages/52/ed/b7f4f07de100bdb95c1756d3a4d17b90c1a3c53715c1a476f8738058e0fa/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:955f8851919303c92343d2f66165294848d57e9bba6cf6e3625485a70a038d11", size = 144992 }, + { url = "https://files.pythonhosted.org/packages/96/2c/d49710a6dbcd3776265f4c923bb73ebe83933dfbaa841c5da850fe0fd20b/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:44ecbf16649486d4aebafeaa7ec4c9fed8b88101f4dd612dcaf65d5e815f837f", size = 147162 }, + { url = "https://files.pythonhosted.org/packages/b4/41/35ff1f9a6bd380303dea55e44c4933b4cc3c4850988927d4082ada230273/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0924e81d3d5e70f8126529951dac65c1010cdf117bb75eb02dd12339b57749dd", size = 140972 }, + { url = "https://files.pythonhosted.org/packages/fb/43/c6a0b685fe6910d08ba971f62cd9c3e862a85770395ba5d9cad4fede33ab/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2967f74ad52c3b98de4c3b32e1a44e32975e008a9cd2a8cc8966d6a5218c5cb2", size = 149095 }, + { url = "https://files.pythonhosted.org/packages/4c/ff/a9a504662452e2d2878512115638966e75633519ec11f25fca3d2049a94a/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c75cb2a3e389853835e84a2d8fb2b81a10645b503eca9bcb98df6b5a43eb8886", size = 152668 }, + { url = "https://files.pythonhosted.org/packages/6c/71/189996b6d9a4b932564701628af5cee6716733e9165af1d5e1b285c530ed/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:09b26ae6b1abf0d27570633b2b078a2a20419c99d66fb2823173d73f188ce601", size = 150073 }, + { url = "https://files.pythonhosted.org/packages/e4/93/946a86ce20790e11312c87c75ba68d5f6ad2208cfb52b2d6a2c32840d922/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd", size = 145732 }, + { url = "https://files.pythonhosted.org/packages/cd/e5/131d2fb1b0dddafc37be4f3a2fa79aa4c037368be9423061dccadfd90091/charset_normalizer-3.4.1-cp313-cp313-win32.whl", hash = "sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407", size = 95391 }, + { url = "https://files.pythonhosted.org/packages/27/f2/4f9a69cc7712b9b5ad8fdb87039fd89abba997ad5cbe690d1835d40405b0/charset_normalizer-3.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971", size = 102702 }, + { url = "https://files.pythonhosted.org/packages/10/bd/6517ea94f2672e801011d50b5d06be2a0deaf566aea27bcdcd47e5195357/charset_normalizer-3.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ecddf25bee22fe4fe3737a399d0d177d72bc22be6913acfab364b40bce1ba83c", size = 195653 }, + { url = "https://files.pythonhosted.org/packages/e5/0d/815a2ba3f283b4eeaa5ece57acade365c5b4135f65a807a083c818716582/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c60ca7339acd497a55b0ea5d506b2a2612afb2826560416f6894e8b5770d4a9", size = 140701 }, + { url = "https://files.pythonhosted.org/packages/aa/17/c94be7ee0d142687e047fe1de72060f6d6837f40eedc26e87e6e124a3fc6/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7b2d86dd06bfc2ade3312a83a5c364c7ec2e3498f8734282c6c3d4b07b346b8", size = 150495 }, + { url = "https://files.pythonhosted.org/packages/f7/33/557ac796c47165fc141e4fb71d7b0310f67e05cb420756f3a82e0a0068e0/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd78cfcda14a1ef52584dbb008f7ac81c1328c0f58184bf9a84c49c605002da6", size = 142946 }, + { url = 
"https://files.pythonhosted.org/packages/1e/0d/38ef4ae41e9248d63fc4998d933cae22473b1b2ac4122cf908d0f5eb32aa/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e27f48bcd0957c6d4cb9d6fa6b61d192d0b13d5ef563e5f2ae35feafc0d179c", size = 144737 }, + { url = "https://files.pythonhosted.org/packages/43/01/754cdb29dd0560f58290aaaa284d43eea343ad0512e6ad3b8b5c11f08592/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01ad647cdd609225c5350561d084b42ddf732f4eeefe6e678765636791e78b9a", size = 147471 }, + { url = "https://files.pythonhosted.org/packages/ba/cd/861883ba5160c7a9bd242c30b2c71074cda2aefcc0addc91118e0d4e0765/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:619a609aa74ae43d90ed2e89bdd784765de0a25ca761b93e196d938b8fd1dbbd", size = 140801 }, + { url = "https://files.pythonhosted.org/packages/6f/7f/0c0dad447819e90b93f8ed238cc8f11b91353c23c19e70fa80483a155bed/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:89149166622f4db9b4b6a449256291dc87a99ee53151c74cbd82a53c8c2f6ccd", size = 149312 }, + { url = "https://files.pythonhosted.org/packages/8e/09/9f8abcc6fff60fb727268b63c376c8c79cc37b833c2dfe1f535dfb59523b/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:7709f51f5f7c853f0fb938bcd3bc59cdfdc5203635ffd18bf354f6967ea0f824", size = 152347 }, + { url = "https://files.pythonhosted.org/packages/be/e5/3f363dad2e24378f88ccf63ecc39e817c29f32e308ef21a7a6d9c1201165/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:345b0426edd4e18138d6528aed636de7a9ed169b4aaf9d61a8c19e39d26838ca", size = 149888 }, + { url = "https://files.pythonhosted.org/packages/e4/10/a78c0e91f487b4ad0ef7480ac765e15b774f83de2597f1b6ef0eaf7a2f99/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0907f11d019260cdc3f94fbdb23ff9125f6b5d1039b76003b5b0ac9d6a6c9d5b", 
size = 145169 }, + { url = "https://files.pythonhosted.org/packages/d3/81/396e7d7f5d7420da8273c91175d2e9a3f569288e3611d521685e4b9ac9cc/charset_normalizer-3.4.1-cp38-cp38-win32.whl", hash = "sha256:ea0d8d539afa5eb2728aa1932a988a9a7af94f18582ffae4bc10b3fbdad0626e", size = 95094 }, + { url = "https://files.pythonhosted.org/packages/40/bb/20affbbd9ea29c71ea123769dc568a6d42052ff5089c5fe23e21e21084a6/charset_normalizer-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:329ce159e82018d646c7ac45b01a430369d526569ec08516081727a20e9e4af4", size = 102139 }, + { url = "https://files.pythonhosted.org/packages/7f/c0/b913f8f02836ed9ab32ea643c6fe4d3325c3d8627cf6e78098671cafff86/charset_normalizer-3.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b97e690a2118911e39b4042088092771b4ae3fc3aa86518f84b8cf6888dbdb41", size = 197867 }, + { url = "https://files.pythonhosted.org/packages/0f/6c/2bee440303d705b6fb1e2ec789543edec83d32d258299b16eed28aad48e0/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78baa6d91634dfb69ec52a463534bc0df05dbd546209b79a3880a34487f4b84f", size = 141385 }, + { url = "https://files.pythonhosted.org/packages/3d/04/cb42585f07f6f9fd3219ffb6f37d5a39b4fd2db2355b23683060029c35f7/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a2bc9f351a75ef49d664206d51f8e5ede9da246602dc2d2726837620ea034b2", size = 151367 }, + { url = "https://files.pythonhosted.org/packages/54/54/2412a5b093acb17f0222de007cc129ec0e0df198b5ad2ce5699355269dfe/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75832c08354f595c760a804588b9357d34ec00ba1c940c15e31e96d902093770", size = 143928 }, + { url = "https://files.pythonhosted.org/packages/5a/6d/e2773862b043dcf8a221342954f375392bb2ce6487bcd9f2c1b34e1d6781/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:0af291f4fe114be0280cdd29d533696a77b5b49cfde5467176ecab32353395c4", size = 146203 }, + { url = "https://files.pythonhosted.org/packages/b9/f8/ca440ef60d8f8916022859885f231abb07ada3c347c03d63f283bec32ef5/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0167ddc8ab6508fe81860a57dd472b2ef4060e8d378f0cc555707126830f2537", size = 148082 }, + { url = "https://files.pythonhosted.org/packages/04/d2/42fd330901aaa4b805a1097856c2edf5095e260a597f65def493f4b8c833/charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2a75d49014d118e4198bcee5ee0a6f25856b29b12dbf7cd012791f8a6cc5c496", size = 142053 }, + { url = "https://files.pythonhosted.org/packages/9e/af/3a97a4fa3c53586f1910dadfc916e9c4f35eeada36de4108f5096cb7215f/charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:363e2f92b0f0174b2f8238240a1a30142e3db7b957a5dd5689b0e75fb717cc78", size = 150625 }, + { url = "https://files.pythonhosted.org/packages/26/ae/23d6041322a3556e4da139663d02fb1b3c59a23ab2e2b56432bd2ad63ded/charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ab36c8eb7e454e34e60eb55ca5d241a5d18b2c6244f6827a30e451c42410b5f7", size = 153549 }, + { url = "https://files.pythonhosted.org/packages/94/22/b8f2081c6a77cb20d97e57e0b385b481887aa08019d2459dc2858ed64871/charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:4c0907b1928a36d5a998d72d64d8eaa7244989f7aaaf947500d3a800c83a3fd6", size = 150945 }, + { url = "https://files.pythonhosted.org/packages/c7/0b/c5ec5092747f801b8b093cdf5610e732b809d6cb11f4c51e35fc28d1d389/charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:04432ad9479fa40ec0f387795ddad4437a2b50417c69fa275e212933519ff294", size = 146595 }, + { url = "https://files.pythonhosted.org/packages/0c/5a/0b59704c38470df6768aa154cc87b1ac7c9bb687990a1559dc8765e8627e/charset_normalizer-3.4.1-cp39-cp39-win32.whl", hash = 
"sha256:3bed14e9c89dcb10e8f3a29f9ccac4955aebe93c71ae803af79265c9ca5644c5", size = 95453 }, + { url = "https://files.pythonhosted.org/packages/85/2d/a9790237cb4d01a6d57afadc8573c8b73c609ade20b80f4cda30802009ee/charset_normalizer-3.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:49402233c892a461407c512a19435d1ce275543138294f7ef013f0b63d5d3765", size = 102811 }, + { url = "https://files.pythonhosted.org/packages/0e/f6/65ecc6878a89bb1c23a086ea335ad4bf21a588990c3f535a227b9eea9108/charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85", size = 49767 }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, +] + +[[package]] +name = "datafusion" +version = "43.0.0" +source = { editable = "." 
} +dependencies = [ + { name = "pyarrow", version = "17.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "pyarrow", version = "18.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] + +[package.dev-dependencies] +dev = [ + { name = "maturin" }, + { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "pytest" }, + { name = "ruff" }, + { name = "toml" }, +] +docs = [ + { name = "ipython", version = "8.12.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "ipython", version = "8.18.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "ipython", version = "8.31.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "jinja2" }, + { name = "myst-parser", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "myst-parser", version = "4.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "pandas", version = "2.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "pandas", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "pickleshare" }, + { name = "pydata-sphinx-theme" }, + { name = "setuptools", version = "75.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "setuptools", version = "75.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "sphinx", version = 
"7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "sphinx-autoapi" }, +] + +[package.metadata] +requires-dist = [ + { name = "pyarrow", specifier = ">=11.0.0" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] + +[package.metadata.requires-dev] +dev = [ + { name = "maturin", specifier = ">=1.8.1" }, + { name = "numpy", marker = "python_full_version >= '3.10'", specifier = ">1.24.4" }, + { name = "pytest", specifier = ">=7.4.4" }, + { name = "ruff", specifier = ">=0.9.1" }, + { name = "toml", specifier = ">=0.10.2" }, +] +docs = [ + { name = "ipython", specifier = ">=8.12.3" }, + { name = "jinja2", specifier = ">=3.1.5" }, + { name = "myst-parser", specifier = ">=3.0.1" }, + { name = "pandas", specifier = ">=2.0.3" }, + { name = "pickleshare", specifier = ">=0.7.5" }, + { name = "pydata-sphinx-theme", specifier = "==0.8.0" }, + { name = "setuptools", specifier = ">=75.3.0" }, + { name = "sphinx", specifier = ">=7.1.2" }, + { name = "sphinx-autoapi", specifier = ">=3.4.0" }, +] + +[[package]] +name = "decorator" +version = "5.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/66/0c/8d907af351aa16b42caae42f9d6aa37b900c67308052d10fdce809f8d952/decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330", size = 35016 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186", size = 9073 }, +] + +[[package]] +name = 
"docutils" +version = "0.20.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +sdist = { url = "https://files.pythonhosted.org/packages/1f/53/a5da4f2c5739cf66290fac1431ee52aff6851c7c8ffd8264f13affd7bcdd/docutils-0.20.1.tar.gz", hash = "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b", size = 2058365 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/87/f238c0670b94533ac0353a4e2a1a771a0cc73277b88bff23d3ae35a256c1/docutils-0.20.1-py3-none-any.whl", hash = "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6", size = 572666 }, +] + +[[package]] +name = "docutils" +version = "0.21.2" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", + "python_full_version == '3.9.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/ae/ed/aefcc8cd0ba62a0560c3c18c33925362d46c6075480bfa4df87b28e169a9/docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f", size = 2204444 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408 }, +] + +[[package]] +name = "exceptiongroup" +version = "1.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/09/35/2495c4ac46b980e4ca1f6ad6db102322ef3ad2410b79fdde159a4b0f3b92/exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc", size = 28883 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = 
"sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453 }, +] + +[[package]] +name = "executing" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8c/e3/7d45f492c2c4a0e8e0fad57d081a7c8a0286cdd86372b070cca1ec0caa1e/executing-2.1.0.tar.gz", hash = "sha256:8ea27ddd260da8150fa5a708269c4a10e76161e2496ec3e587da9e3c0fe4b9ab", size = 977485 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/fd/afcd0496feca3276f509df3dbd5dae726fcc756f1a08d9e25abe1733f962/executing-2.1.0-py2.py3-none-any.whl", hash = "sha256:8d63781349375b5ebccc3142f4b30350c0cd9c79f921cde38be2be4637e98eaf", size = 25805 }, +] + +[[package]] +name = "idna" +version = "3.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, +] + +[[package]] +name = "imagesize" +version = "1.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/84/62473fb57d61e31fef6e36d64a179c8781605429fd927b5dd608c997be31/imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a", size = 1280026 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", size = 8769 }, +] + +[[package]] +name = "importlib-metadata" +version = 
"8.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp", version = "3.20.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "zipp", version = "3.21.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cd/12/33e59336dca5be0c398a7482335911a33aa0e20776128f038019f1a95f1b/importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7", size = 55304 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/d9/a1e041c5e7caa9a05c925f4bdbdfb7f006d1f74996af53467bc394c97be7/importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b", size = 26514 }, +] + +[[package]] +name = "iniconfig" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/4b/cbd8e699e64a6f16ca3a8220661b5f83792b3017d0f79807cb8708d33913/iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3", size = 4646 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, +] + +[[package]] +name = "ipython" +version = "8.12.3" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +dependencies = [ + { name = "appnope", marker = "python_full_version < '3.9' and sys_platform == 'darwin'" }, + { name = "backcall", marker = "python_full_version < '3.9'" }, + { name = "colorama", marker = "python_full_version < '3.9' and sys_platform == 'win32'" }, + { name = "decorator", marker = "python_full_version < '3.9'" }, + { 
name = "jedi", marker = "python_full_version < '3.9'" }, + { name = "matplotlib-inline", marker = "python_full_version < '3.9'" }, + { name = "pexpect", marker = "python_full_version < '3.9' and sys_platform != 'win32'" }, + { name = "pickleshare", marker = "python_full_version < '3.9'" }, + { name = "prompt-toolkit", marker = "python_full_version < '3.9'" }, + { name = "pygments", marker = "python_full_version < '3.9'" }, + { name = "stack-data", marker = "python_full_version < '3.9'" }, + { name = "traitlets", marker = "python_full_version < '3.9'" }, + { name = "typing-extensions", marker = "python_full_version < '3.9'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/6a/44ef299b1762f5a73841e87fae8a73a8cc8aee538d6dc8c77a5afe1fd2ce/ipython-8.12.3.tar.gz", hash = "sha256:3910c4b54543c2ad73d06579aa771041b7d5707b033bd488669b4cf544e3b363", size = 5470171 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8d/97/8fe103906cd81bc42d3b0175b5534a9f67dccae47d6451131cf8d0d70bb2/ipython-8.12.3-py3-none-any.whl", hash = "sha256:b0340d46a933d27c657b211a329d0be23793c36595acf9e6ef4164bc01a1804c", size = 798307 }, +] + +[[package]] +name = "ipython" +version = "8.18.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version == '3.9.*'", +] +dependencies = [ + { name = "colorama", marker = "python_full_version == '3.9.*' and sys_platform == 'win32'" }, + { name = "decorator", marker = "python_full_version == '3.9.*'" }, + { name = "exceptiongroup", marker = "python_full_version == '3.9.*'" }, + { name = "jedi", marker = "python_full_version == '3.9.*'" }, + { name = "matplotlib-inline", marker = "python_full_version == '3.9.*'" }, + { name = "pexpect", marker = "python_full_version == '3.9.*' and sys_platform != 'win32'" }, + { name = "prompt-toolkit", marker = "python_full_version == '3.9.*'" }, + { name = "pygments", marker = "python_full_version == '3.9.*'" }, + { name = "stack-data", marker = 
"python_full_version == '3.9.*'" }, + { name = "traitlets", marker = "python_full_version == '3.9.*'" }, + { name = "typing-extensions", marker = "python_full_version == '3.9.*'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/b9/3ba6c45a6df813c09a48bac313c22ff83efa26cbb55011218d925a46e2ad/ipython-8.18.1.tar.gz", hash = "sha256:ca6f079bb33457c66e233e4580ebfc4128855b4cf6370dddd73842a9563e8a27", size = 5486330 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/6b/d9fdcdef2eb6a23f391251fde8781c38d42acd82abe84d054cb74f7863b0/ipython-8.18.1-py3-none-any.whl", hash = "sha256:e8267419d72d81955ec1177f8a29aaa90ac80ad647499201119e2f05e99aa397", size = 808161 }, +] + +[[package]] +name = "ipython" +version = "8.31.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", +] +dependencies = [ + { name = "colorama", marker = "python_full_version >= '3.10' and sys_platform == 'win32'" }, + { name = "decorator", marker = "python_full_version >= '3.10'" }, + { name = "exceptiongroup", marker = "python_full_version == '3.10.*'" }, + { name = "jedi", marker = "python_full_version >= '3.10'" }, + { name = "matplotlib-inline", marker = "python_full_version >= '3.10'" }, + { name = "pexpect", marker = "python_full_version >= '3.10' and sys_platform != 'emscripten' and sys_platform != 'win32'" }, + { name = "prompt-toolkit", marker = "python_full_version >= '3.10'" }, + { name = "pygments", marker = "python_full_version >= '3.10'" }, + { name = "stack-data", marker = "python_full_version >= '3.10'" }, + { name = "traitlets", marker = "python_full_version >= '3.10'" }, + { name = "typing-extensions", marker = "python_full_version >= '3.10' and python_full_version < '3.12'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/01/35/6f90fdddff7a08b7b715fccbd2427b5212c9525cd043d26fdc45bee0708d/ipython-8.31.0.tar.gz", 
hash = "sha256:b6a2274606bec6166405ff05e54932ed6e5cfecaca1fc05f2cacde7bb074d70b", size = 5501011 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/60/d0feb6b6d9fe4ab89fe8fe5b47cbf6cd936bfd9f1e7ffa9d0015425aeed6/ipython-8.31.0-py3-none-any.whl", hash = "sha256:46ec58f8d3d076a61d128fe517a51eb730e3aaf0c184ea8c17d16e366660c6a6", size = 821583 }, +] + +[[package]] +name = "jedi" +version = "0.19.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "parso" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278 }, +] + +[[package]] +name = "jinja2" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe", version = "2.1.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "markupsafe", version = "3.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/af/92/b3130cbbf5591acf9ade8708c365f3238046ac7cb8ccba6e81abccb0ccff/jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb", size = 244674 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bd/0f/2ba5fbcd631e3e88689309dbe978c5769e883e4b84ebfe7da30b43275c5a/jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb", size = 134596 }, +] + +[[package]] +name = "markdown-it-py" +version = "3.0.0" +source = { 
registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/38/71/3b932df36c1a044d397a1f92d1cf91ee0a503d91e470cbd670aa66b07ed0/markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb", size = 74596 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528 }, +] + +[[package]] +name = "markupsafe" +version = "2.1.5" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +sdist = { url = "https://files.pythonhosted.org/packages/87/5b/aae44c6655f3801e81aa3eef09dbbf012431987ba564d7231722f68df02d/MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b", size = 19384 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e4/54/ad5eb37bf9d51800010a74e4665425831a9db4e7c4e0fde4352e391e808e/MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc", size = 18206 }, + { url = "https://files.pythonhosted.org/packages/6a/4a/a4d49415e600bacae038c67f9fecc1d5433b9d3c71a4de6f33537b89654c/MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5", size = 14079 }, + { url = "https://files.pythonhosted.org/packages/0a/7b/85681ae3c33c385b10ac0f8dd025c30af83c78cec1c37a6aa3b55e67f5ec/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46", size = 26620 }, + { url = 
"https://files.pythonhosted.org/packages/7c/52/2b1b570f6b8b803cef5ac28fdf78c0da318916c7d2fe9402a84d591b394c/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f", size = 25818 }, + { url = "https://files.pythonhosted.org/packages/29/fe/a36ba8c7ca55621620b2d7c585313efd10729e63ef81e4e61f52330da781/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900", size = 25493 }, + { url = "https://files.pythonhosted.org/packages/60/ae/9c60231cdfda003434e8bd27282b1f4e197ad5a710c14bee8bea8a9ca4f0/MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff", size = 30630 }, + { url = "https://files.pythonhosted.org/packages/65/dc/1510be4d179869f5dafe071aecb3f1f41b45d37c02329dfba01ff59e5ac5/MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad", size = 29745 }, + { url = "https://files.pythonhosted.org/packages/30/39/8d845dd7d0b0613d86e0ef89549bfb5f61ed781f59af45fc96496e897f3a/MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd", size = 30021 }, + { url = "https://files.pythonhosted.org/packages/c7/5c/356a6f62e4f3c5fbf2602b4771376af22a3b16efa74eb8716fb4e328e01e/MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4", size = 16659 }, + { url = "https://files.pythonhosted.org/packages/69/48/acbf292615c65f0604a0c6fc402ce6d8c991276e16c80c46a8f758fbd30c/MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5", size = 17213 }, + { url = 
"https://files.pythonhosted.org/packages/11/e7/291e55127bb2ae67c64d66cef01432b5933859dfb7d6949daa721b89d0b3/MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f", size = 18219 }, + { url = "https://files.pythonhosted.org/packages/6b/cb/aed7a284c00dfa7c0682d14df85ad4955a350a21d2e3b06d8240497359bf/MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2", size = 14098 }, + { url = "https://files.pythonhosted.org/packages/1c/cf/35fe557e53709e93feb65575c93927942087e9b97213eabc3fe9d5b25a55/MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced", size = 29014 }, + { url = "https://files.pythonhosted.org/packages/97/18/c30da5e7a0e7f4603abfc6780574131221d9148f323752c2755d48abad30/MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5", size = 28220 }, + { url = "https://files.pythonhosted.org/packages/0c/40/2e73e7d532d030b1e41180807a80d564eda53babaf04d65e15c1cf897e40/MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c", size = 27756 }, + { url = "https://files.pythonhosted.org/packages/18/46/5dca760547e8c59c5311b332f70605d24c99d1303dd9a6e1fc3ed0d73561/MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f", size = 33988 }, + { url = "https://files.pythonhosted.org/packages/6d/c5/27febe918ac36397919cd4a67d5579cbbfa8da027fa1238af6285bb368ea/MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a", size = 32718 }, + { url 
= "https://files.pythonhosted.org/packages/f8/81/56e567126a2c2bc2684d6391332e357589a96a76cb9f8e5052d85cb0ead8/MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f", size = 33317 }, + { url = "https://files.pythonhosted.org/packages/00/0b/23f4b2470accb53285c613a3ab9ec19dc944eaf53592cb6d9e2af8aa24cc/MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906", size = 16670 }, + { url = "https://files.pythonhosted.org/packages/b7/a2/c78a06a9ec6d04b3445a949615c4c7ed86a0b2eb68e44e7541b9d57067cc/MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617", size = 17224 }, + { url = "https://files.pythonhosted.org/packages/53/bd/583bf3e4c8d6a321938c13f49d44024dbe5ed63e0a7ba127e454a66da974/MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1", size = 18215 }, + { url = "https://files.pythonhosted.org/packages/48/d6/e7cd795fc710292c3af3a06d80868ce4b02bfbbf370b7cee11d282815a2a/MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4", size = 14069 }, + { url = "https://files.pythonhosted.org/packages/51/b5/5d8ec796e2a08fc814a2c7d2584b55f889a55cf17dd1a90f2beb70744e5c/MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee", size = 29452 }, + { url = "https://files.pythonhosted.org/packages/0a/0d/2454f072fae3b5a137c119abf15465d1771319dfe9e4acbb31722a0fff91/MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5", size = 28462 }, + { url = 
"https://files.pythonhosted.org/packages/2d/75/fd6cb2e68780f72d47e6671840ca517bda5ef663d30ada7616b0462ad1e3/MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b", size = 27869 }, + { url = "https://files.pythonhosted.org/packages/b0/81/147c477391c2750e8fc7705829f7351cf1cd3be64406edcf900dc633feb2/MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a", size = 33906 }, + { url = "https://files.pythonhosted.org/packages/8b/ff/9a52b71839d7a256b563e85d11050e307121000dcebc97df120176b3ad93/MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f", size = 32296 }, + { url = "https://files.pythonhosted.org/packages/88/07/2dc76aa51b481eb96a4c3198894f38b480490e834479611a4053fbf08623/MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169", size = 33038 }, + { url = "https://files.pythonhosted.org/packages/96/0c/620c1fb3661858c0e37eb3cbffd8c6f732a67cd97296f725789679801b31/MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad", size = 16572 }, + { url = "https://files.pythonhosted.org/packages/3f/14/c3554d512d5f9100a95e737502f4a2323a1959f6d0d01e0d0997b35f7b10/MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb", size = 17127 }, + { url = "https://files.pythonhosted.org/packages/f8/ff/2c942a82c35a49df5de3a630ce0a8456ac2969691b230e530ac12314364c/MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a", size = 18192 }, + { url = 
"https://files.pythonhosted.org/packages/4f/14/6f294b9c4f969d0c801a4615e221c1e084722ea6114ab2114189c5b8cbe0/MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46", size = 14072 }, + { url = "https://files.pythonhosted.org/packages/81/d4/fd74714ed30a1dedd0b82427c02fa4deec64f173831ec716da11c51a50aa/MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532", size = 26928 }, + { url = "https://files.pythonhosted.org/packages/c7/bd/50319665ce81bb10e90d1cf76f9e1aa269ea6f7fa30ab4521f14d122a3df/MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab", size = 26106 }, + { url = "https://files.pythonhosted.org/packages/4c/6f/f2b0f675635b05f6afd5ea03c094557bdb8622fa8e673387444fe8d8e787/MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68", size = 25781 }, + { url = "https://files.pythonhosted.org/packages/51/e0/393467cf899b34a9d3678e78961c2c8cdf49fb902a959ba54ece01273fb1/MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0", size = 30518 }, + { url = "https://files.pythonhosted.org/packages/f6/02/5437e2ad33047290dafced9df741d9efc3e716b75583bbd73a9984f1b6f7/MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4", size = 29669 }, + { url = "https://files.pythonhosted.org/packages/0e/7d/968284145ffd9d726183ed6237c77938c021abacde4e073020f920e060b2/MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3", size = 29933 }, + { url = 
"https://files.pythonhosted.org/packages/bf/f3/ecb00fc8ab02b7beae8699f34db9357ae49d9f21d4d3de6f305f34fa949e/MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff", size = 16656 }, + { url = "https://files.pythonhosted.org/packages/92/21/357205f03514a49b293e214ac39de01fadd0970a6e05e4bf1ddd0ffd0881/MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029", size = 17206 }, + { url = "https://files.pythonhosted.org/packages/0f/31/780bb297db036ba7b7bbede5e1d7f1e14d704ad4beb3ce53fb495d22bc62/MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf", size = 18193 }, + { url = "https://files.pythonhosted.org/packages/6c/77/d77701bbef72892affe060cdacb7a2ed7fd68dae3b477a8642f15ad3b132/MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2", size = 14073 }, + { url = "https://files.pythonhosted.org/packages/d9/a7/1e558b4f78454c8a3a0199292d96159eb4d091f983bc35ef258314fe7269/MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8", size = 26486 }, + { url = "https://files.pythonhosted.org/packages/5f/5a/360da85076688755ea0cceb92472923086993e86b5613bbae9fbc14136b0/MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3", size = 25685 }, + { url = "https://files.pythonhosted.org/packages/6a/18/ae5a258e3401f9b8312f92b028c54d7026a97ec3ab20bfaddbdfa7d8cce8/MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465", size = 25338 }, + { url = 
"https://files.pythonhosted.org/packages/0b/cc/48206bd61c5b9d0129f4d75243b156929b04c94c09041321456fd06a876d/MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e", size = 30439 }, + { url = "https://files.pythonhosted.org/packages/d1/06/a41c112ab9ffdeeb5f77bc3e331fdadf97fa65e52e44ba31880f4e7f983c/MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea", size = 29531 }, + { url = "https://files.pythonhosted.org/packages/02/8c/ab9a463301a50dab04d5472e998acbd4080597abc048166ded5c7aa768c8/MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6", size = 29823 }, + { url = "https://files.pythonhosted.org/packages/bc/29/9bc18da763496b055d8e98ce476c8e718dcfd78157e17f555ce6dd7d0895/MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf", size = 16658 }, + { url = "https://files.pythonhosted.org/packages/f6/f8/4da07de16f10551ca1f640c92b5f316f9394088b183c6a57183df6de5ae4/MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5", size = 17211 }, +] + +[[package]] +name = "markupsafe" +version = "3.0.2" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", + "python_full_version == '3.9.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/b2/97/5d42485e71dfc078108a86d6de8fa46db44a1a9295e89c5d6d4a06e23a62/markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0", size = 20537 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/04/90/d08277ce111dd22f77149fd1a5d4653eeb3b3eaacbdfcbae5afb2600eebd/MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8", size = 14357 }, + { url = "https://files.pythonhosted.org/packages/04/e1/6e2194baeae0bca1fae6629dc0cbbb968d4d941469cbab11a3872edff374/MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158", size = 12393 }, + { url = "https://files.pythonhosted.org/packages/1d/69/35fa85a8ece0a437493dc61ce0bb6d459dcba482c34197e3efc829aa357f/MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579", size = 21732 }, + { url = "https://files.pythonhosted.org/packages/22/35/137da042dfb4720b638d2937c38a9c2df83fe32d20e8c8f3185dbfef05f7/MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d", size = 20866 }, + { url = "https://files.pythonhosted.org/packages/29/28/6d029a903727a1b62edb51863232152fd335d602def598dade38996887f0/MarkupSafe-3.0.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb", size = 20964 }, + { url = "https://files.pythonhosted.org/packages/cc/cd/07438f95f83e8bc028279909d9c9bd39e24149b0d60053a97b2bc4f8aa51/MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b", size = 21977 }, + { url = "https://files.pythonhosted.org/packages/29/01/84b57395b4cc062f9c4c55ce0df7d3108ca32397299d9df00fedd9117d3d/MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c", size = 21366 }, + { url 
= "https://files.pythonhosted.org/packages/bd/6e/61ebf08d8940553afff20d1fb1ba7294b6f8d279df9fd0c0db911b4bbcfd/MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171", size = 21091 }, + { url = "https://files.pythonhosted.org/packages/11/23/ffbf53694e8c94ebd1e7e491de185124277964344733c45481f32ede2499/MarkupSafe-3.0.2-cp310-cp310-win32.whl", hash = "sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50", size = 15065 }, + { url = "https://files.pythonhosted.org/packages/44/06/e7175d06dd6e9172d4a69a72592cb3f7a996a9c396eee29082826449bbc3/MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a", size = 15514 }, + { url = "https://files.pythonhosted.org/packages/6b/28/bbf83e3f76936960b850435576dd5e67034e200469571be53f69174a2dfd/MarkupSafe-3.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d", size = 14353 }, + { url = "https://files.pythonhosted.org/packages/6c/30/316d194b093cde57d448a4c3209f22e3046c5bb2fb0820b118292b334be7/MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93", size = 12392 }, + { url = "https://files.pythonhosted.org/packages/f2/96/9cdafba8445d3a53cae530aaf83c38ec64c4d5427d975c974084af5bc5d2/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832", size = 23984 }, + { url = "https://files.pythonhosted.org/packages/f1/a4/aefb044a2cd8d7334c8a47d3fb2c9f328ac48cb349468cc31c20b539305f/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84", size = 23120 }, + { url = 
"https://files.pythonhosted.org/packages/8d/21/5e4851379f88f3fad1de30361db501300d4f07bcad047d3cb0449fc51f8c/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca", size = 23032 }, + { url = "https://files.pythonhosted.org/packages/00/7b/e92c64e079b2d0d7ddf69899c98842f3f9a60a1ae72657c89ce2655c999d/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798", size = 24057 }, + { url = "https://files.pythonhosted.org/packages/f9/ac/46f960ca323037caa0a10662ef97d0a4728e890334fc156b9f9e52bcc4ca/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e", size = 23359 }, + { url = "https://files.pythonhosted.org/packages/69/84/83439e16197337b8b14b6a5b9c2105fff81d42c2a7c5b58ac7b62ee2c3b1/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4", size = 23306 }, + { url = "https://files.pythonhosted.org/packages/9a/34/a15aa69f01e2181ed8d2b685c0d2f6655d5cca2c4db0ddea775e631918cd/MarkupSafe-3.0.2-cp311-cp311-win32.whl", hash = "sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d", size = 15094 }, + { url = "https://files.pythonhosted.org/packages/da/b8/3a3bd761922d416f3dc5d00bfbed11f66b1ab89a0c2b6e887240a30b0f6b/MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b", size = 15521 }, + { url = "https://files.pythonhosted.org/packages/22/09/d1f21434c97fc42f09d290cbb6350d44eb12f09cc62c9476effdb33a18aa/MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf", size = 14274 }, + { url = 
"https://files.pythonhosted.org/packages/6b/b0/18f76bba336fa5aecf79d45dcd6c806c280ec44538b3c13671d49099fdd0/MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225", size = 12348 }, + { url = "https://files.pythonhosted.org/packages/e0/25/dd5c0f6ac1311e9b40f4af06c78efde0f3b5cbf02502f8ef9501294c425b/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028", size = 24149 }, + { url = "https://files.pythonhosted.org/packages/f3/f0/89e7aadfb3749d0f52234a0c8c7867877876e0a20b60e2188e9850794c17/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8", size = 23118 }, + { url = "https://files.pythonhosted.org/packages/d5/da/f2eeb64c723f5e3777bc081da884b414671982008c47dcc1873d81f625b6/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c", size = 22993 }, + { url = "https://files.pythonhosted.org/packages/da/0e/1f32af846df486dce7c227fe0f2398dc7e2e51d4a370508281f3c1c5cddc/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557", size = 24178 }, + { url = "https://files.pythonhosted.org/packages/c4/f6/bb3ca0532de8086cbff5f06d137064c8410d10779c4c127e0e47d17c0b71/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22", size = 23319 }, + { url = "https://files.pythonhosted.org/packages/a2/82/8be4c96ffee03c5b4a034e60a31294daf481e12c7c43ab8e34a1453ee48b/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48", size = 23352 }, + { url = 
"https://files.pythonhosted.org/packages/51/ae/97827349d3fcffee7e184bdf7f41cd6b88d9919c80f0263ba7acd1bbcb18/MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30", size = 15097 }, + { url = "https://files.pythonhosted.org/packages/c1/80/a61f99dc3a936413c3ee4e1eecac96c0da5ed07ad56fd975f1a9da5bc630/MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87", size = 15601 }, + { url = "https://files.pythonhosted.org/packages/83/0e/67eb10a7ecc77a0c2bbe2b0235765b98d164d81600746914bebada795e97/MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd", size = 14274 }, + { url = "https://files.pythonhosted.org/packages/2b/6d/9409f3684d3335375d04e5f05744dfe7e9f120062c9857df4ab490a1031a/MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430", size = 12352 }, + { url = "https://files.pythonhosted.org/packages/d2/f5/6eadfcd3885ea85fe2a7c128315cc1bb7241e1987443d78c8fe712d03091/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094", size = 24122 }, + { url = "https://files.pythonhosted.org/packages/0c/91/96cf928db8236f1bfab6ce15ad070dfdd02ed88261c2afafd4b43575e9e9/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396", size = 23085 }, + { url = "https://files.pythonhosted.org/packages/c2/cf/c9d56af24d56ea04daae7ac0940232d31d5a8354f2b457c6d856b2057d69/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79", size = 22978 }, + { url = 
"https://files.pythonhosted.org/packages/2a/9f/8619835cd6a711d6272d62abb78c033bda638fdc54c4e7f4272cf1c0962b/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a", size = 24208 }, + { url = "https://files.pythonhosted.org/packages/f9/bf/176950a1792b2cd2102b8ffeb5133e1ed984547b75db47c25a67d3359f77/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca", size = 23357 }, + { url = "https://files.pythonhosted.org/packages/ce/4f/9a02c1d335caabe5c4efb90e1b6e8ee944aa245c1aaaab8e8a618987d816/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c", size = 23344 }, + { url = "https://files.pythonhosted.org/packages/ee/55/c271b57db36f748f0e04a759ace9f8f759ccf22b4960c270c78a394f58be/MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1", size = 15101 }, + { url = "https://files.pythonhosted.org/packages/29/88/07df22d2dd4df40aba9f3e402e6dc1b8ee86297dddbad4872bd5e7b0094f/MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f", size = 15603 }, + { url = "https://files.pythonhosted.org/packages/62/6a/8b89d24db2d32d433dffcd6a8779159da109842434f1dd2f6e71f32f738c/MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c", size = 14510 }, + { url = "https://files.pythonhosted.org/packages/7a/06/a10f955f70a2e5a9bf78d11a161029d278eeacbd35ef806c3fd17b13060d/MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb", size = 12486 }, + { url = 
"https://files.pythonhosted.org/packages/34/cf/65d4a571869a1a9078198ca28f39fba5fbb910f952f9dbc5220afff9f5e6/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c", size = 25480 }, + { url = "https://files.pythonhosted.org/packages/0c/e3/90e9651924c430b885468b56b3d597cabf6d72be4b24a0acd1fa0e12af67/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d", size = 23914 }, + { url = "https://files.pythonhosted.org/packages/66/8c/6c7cf61f95d63bb866db39085150df1f2a5bd3335298f14a66b48e92659c/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe", size = 23796 }, + { url = "https://files.pythonhosted.org/packages/bb/35/cbe9238ec3f47ac9a7c8b3df7a808e7cb50fe149dc7039f5f454b3fba218/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5", size = 25473 }, + { url = "https://files.pythonhosted.org/packages/e6/32/7621a4382488aa283cc05e8984a9c219abad3bca087be9ec77e89939ded9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a", size = 24114 }, + { url = "https://files.pythonhosted.org/packages/0d/80/0985960e4b89922cb5a0bac0ed39c5b96cbc1a536a99f30e8c220a996ed9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9", size = 24098 }, + { url = "https://files.pythonhosted.org/packages/82/78/fedb03c7d5380df2427038ec8d973587e90561b2d90cd472ce9254cf348b/MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6", size = 15208 }, + { url = 
"https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739 }, + { url = "https://files.pythonhosted.org/packages/a7/ea/9b1530c3fdeeca613faeb0fb5cbcf2389d816072fab72a71b45749ef6062/MarkupSafe-3.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a", size = 14344 }, + { url = "https://files.pythonhosted.org/packages/4b/c2/fbdbfe48848e7112ab05e627e718e854d20192b674952d9042ebd8c9e5de/MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff", size = 12389 }, + { url = "https://files.pythonhosted.org/packages/f0/25/7a7c6e4dbd4f867d95d94ca15449e91e52856f6ed1905d58ef1de5e211d0/MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13", size = 21607 }, + { url = "https://files.pythonhosted.org/packages/53/8f/f339c98a178f3c1e545622206b40986a4c3307fe39f70ccd3d9df9a9e425/MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144", size = 20728 }, + { url = "https://files.pythonhosted.org/packages/1a/03/8496a1a78308456dbd50b23a385c69b41f2e9661c67ea1329849a598a8f9/MarkupSafe-3.0.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29", size = 20826 }, + { url = "https://files.pythonhosted.org/packages/e6/cf/0a490a4bd363048c3022f2f475c8c05582179bb179defcee4766fb3dcc18/MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0", size = 21843 }, + { url = 
"https://files.pythonhosted.org/packages/19/a3/34187a78613920dfd3cdf68ef6ce5e99c4f3417f035694074beb8848cd77/MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0", size = 21219 }, + { url = "https://files.pythonhosted.org/packages/17/d8/5811082f85bb88410ad7e452263af048d685669bbbfb7b595e8689152498/MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178", size = 20946 }, + { url = "https://files.pythonhosted.org/packages/7c/31/bd635fb5989440d9365c5e3c47556cfea121c7803f5034ac843e8f37c2f2/MarkupSafe-3.0.2-cp39-cp39-win32.whl", hash = "sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f", size = 15063 }, + { url = "https://files.pythonhosted.org/packages/b3/73/085399401383ce949f727afec55ec3abd76648d04b9f22e1c0e99cb4bec3/MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a", size = 15506 }, +] + +[[package]] +name = "matplotlib-inline" +version = "0.1.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/99/5b/a36a337438a14116b16480db471ad061c36c3694df7c2084a0da7ba538b7/matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90", size = 8159 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899 }, +] + +[[package]] +name = "maturin" +version = "1.8.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/9a/08/ccb0f917722a35ab0d758be9bb5edaf645c3a3d6170061f10d396ecd273f/maturin-1.8.1.tar.gz", hash = "sha256:49cd964aabf59f8b0a6969f9860d2cdf194ac331529caae14c884f5659568857", size = 197397 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4c/00/f34077315f34db8ad2ccf6bfe11b864ca27baab3a1320634da8e3cf89a48/maturin-1.8.1-py3-none-linux_armv6l.whl", hash = "sha256:7e590a23d9076b8a994f2e67bc63dc9a2d1c9a41b1e7b45ac354ba8275254e89", size = 7568415 }, + { url = "https://files.pythonhosted.org/packages/5c/07/9219976135ce0cb32d2fa6ea5c6d0ad709013d9a17967312e149b98153a6/maturin-1.8.1-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:8d8251a95682c83ea60988c804b620c181911cd824aa107b4a49ac5333c92968", size = 14527816 }, + { url = "https://files.pythonhosted.org/packages/e6/04/fa009a00903acdd1785d58322193140bfe358595347c39f315112dabdf9e/maturin-1.8.1-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b9fc1a4354cac5e32c190410208039812ea88c4a36bd2b6499268ec49ef5de00", size = 7580446 }, + { url = "https://files.pythonhosted.org/packages/9b/d4/414b2aab9bbfe88182b734d3aa1b4fef7d7701e50f6be48500378b8c8721/maturin-1.8.1-py3-none-manylinux_2_12_i686.manylinux2010_i686.musllinux_1_1_i686.whl", hash = "sha256:621e171c6b39f95f1d0df69a118416034fbd59c0f89dcaea8c2ea62019deecba", size = 7650535 }, + { url = "https://files.pythonhosted.org/packages/f0/64/879418a8a0196013ec1fb19eada0781c04a30e8d6d9227e80f91275a4f5b/maturin-1.8.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:98f638739a5132962347871b85c91f525c9246ef4d99796ae98a2031e3df029f", size = 8006702 }, + { url = "https://files.pythonhosted.org/packages/39/c2/605829324f8371294f70303aca130682df75318958efed246873d3d604ab/maturin-1.8.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:f9f5c47521924b6e515cbc652a042fe5f17f8747445be9d931048e5d8ddb50a4", size = 
7368164 }, + { url = "https://files.pythonhosted.org/packages/be/6c/30e136d397bb146b94b628c0ef7f17708281611b97849e2cf37847025ac7/maturin-1.8.1-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:0f4407c7353c31bfbb8cdeb82bc2170e474cbfb97b5ba27568f440c9d6c1fdd4", size = 7450889 }, + { url = "https://files.pythonhosted.org/packages/1b/50/e1f5023512696d4e56096f702e2f68d6d9a30afe0a4eec82b0e27b8eb4e4/maturin-1.8.1-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:ec49cd70cad3c389946c6e2bc0bd50772a7fcb463040dd800720345897eec9bf", size = 9585819 }, + { url = "https://files.pythonhosted.org/packages/b7/80/b24b5248d89d2e5982553900237a337ea098ca9297b8369ca2aa95549e0f/maturin-1.8.1-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c08767d794de8f8a11c5c8b1b47a4ff9fb6ae2d2d97679e27030f2f509c8c2a0", size = 10920801 }, + { url = "https://files.pythonhosted.org/packages/6e/f4/8ede7a662fabf93456b44390a5ad22630e25fb5ddaecf787251071b2e143/maturin-1.8.1-py3-none-win32.whl", hash = "sha256:d678407713f3e10df33c5b3d7a343ec0551eb7f14d8ad9ba6febeb96f4e4c75c", size = 6873556 }, + { url = "https://files.pythonhosted.org/packages/9c/22/757f093ed0e319e9648155b8c9d716765442bea5bc98ebc58ad4ad5b0524/maturin-1.8.1-py3-none-win_amd64.whl", hash = "sha256:a526f90fe0e5cb59ffb81f4ff547ddc42e823bbdeae4a31012c0893ca6dcaf46", size = 7823153 }, + { url = "https://files.pythonhosted.org/packages/a4/f5/051413e04f6da25069db5e76759ecdb8cd2a8ab4a94045b5a3bf548c66fa/maturin-1.8.1-py3-none-win_arm64.whl", hash = "sha256:e95f077fd2ddd2f048182880eed458c308571a534be3eb2add4d3dac55bf57f4", size = 6552131 }, +] + +[[package]] +name = "mdit-py-plugins" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/19/03/a2ecab526543b152300717cf232bb4bb8605b6edb946c845016fa9c9c9fd/mdit_py_plugins-0.4.2.tar.gz", hash = "sha256:5f2cd1fdb606ddf152d37ec30e46101a60512bc0e5fa1a7002c36647b09e26b5", size = 43542 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/f7/7782a043553ee469c1ff49cfa1cdace2d6bf99a1f333cf38676b3ddf30da/mdit_py_plugins-0.4.2-py3-none-any.whl", hash = "sha256:0c673c3f889399a33b95e88d2f0d111b4447bdfea7f237dab2d488f459835636", size = 55316 }, +] + +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 }, +] + +[[package]] +name = "myst-parser" +version = "3.0.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version == '3.9.*'", + "python_full_version < '3.9'", +] +dependencies = [ + { name = "docutils", version = "0.20.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "jinja2", marker = "python_full_version < '3.10'" }, + { name = "markdown-it-py", marker = "python_full_version < '3.10'" }, + { name = "mdit-py-plugins", marker = "python_full_version < '3.10'" }, + { name = "pyyaml", marker = "python_full_version < '3.10'" }, + { name = "sphinx", version = "7.1.2", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version < '3.9'" }, + { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/49/64/e2f13dac02f599980798c01156393b781aec983b52a6e4057ee58f07c43a/myst_parser-3.0.1.tar.gz", hash = "sha256:88f0cb406cb363b077d176b51c476f62d60604d68a8dcdf4832e080441301a87", size = 92392 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e2/de/21aa8394f16add8f7427f0a1326ccd2b3a2a8a3245c9252bc5ac034c6155/myst_parser-3.0.1-py3-none-any.whl", hash = "sha256:6457aaa33a5d474aca678b8ead9b3dc298e89c68e67012e73146ea6fd54babf1", size = 83163 }, +] + +[[package]] +name = "myst-parser" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", +] +dependencies = [ + { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "jinja2", marker = "python_full_version >= '3.10'" }, + { name = "markdown-it-py", marker = "python_full_version >= '3.10'" }, + { name = "mdit-py-plugins", marker = "python_full_version >= '3.10'" }, + { name = "pyyaml", marker = "python_full_version >= '3.10'" }, + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/85/55/6d1741a1780e5e65038b74bce6689da15f620261c490c3511eb4c12bac4b/myst_parser-4.0.0.tar.gz", hash = "sha256:851c9dfb44e36e56d15d05e72f02b80da21a9e0d07cba96baf5e2d476bb91531", size = 93858 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/b4/b036f8fdb667587bb37df29dc6644681dd78b7a2a6321a34684b79412b28/myst_parser-4.0.0-py3-none-any.whl", hash = 
"sha256:b9317997552424448c6096c2558872fdb6f81d3ecb3a40ce84a7518798f3f28d", size = 84563 }, +] + +[[package]] +name = "numpy" +version = "1.24.4" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +sdist = { url = "https://files.pythonhosted.org/packages/a4/9b/027bec52c633f6556dba6b722d9a0befb40498b9ceddd29cbe67a45a127c/numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463", size = 10911229 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6b/80/6cdfb3e275d95155a34659163b83c09e3a3ff9f1456880bec6cc63d71083/numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64", size = 19789140 }, + { url = "https://files.pythonhosted.org/packages/64/5f/3f01d753e2175cfade1013eea08db99ba1ee4bdb147ebcf3623b75d12aa7/numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1", size = 13854297 }, + { url = "https://files.pythonhosted.org/packages/5a/b3/2f9c21d799fa07053ffa151faccdceeb69beec5a010576b8991f614021f7/numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4", size = 13995611 }, + { url = "https://files.pythonhosted.org/packages/10/be/ae5bf4737cb79ba437879915791f6f26d92583c738d7d960ad94e5c36adf/numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6", size = 17282357 }, + { url = "https://files.pythonhosted.org/packages/c0/64/908c1087be6285f40e4b3e79454552a701664a079321cff519d8c7051d06/numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc", size = 12429222 }, + { url = 
"https://files.pythonhosted.org/packages/22/55/3d5a7c1142e0d9329ad27cece17933b0e2ab4e54ddc5c1861fbfeb3f7693/numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e", size = 14841514 }, + { url = "https://files.pythonhosted.org/packages/a9/cc/5ed2280a27e5dab12994c884f1f4d8c3bd4d885d02ae9e52a9d213a6a5e2/numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810", size = 19775508 }, + { url = "https://files.pythonhosted.org/packages/c0/bc/77635c657a3668cf652806210b8662e1aff84b818a55ba88257abf6637a8/numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254", size = 13840033 }, + { url = "https://files.pythonhosted.org/packages/a7/4c/96cdaa34f54c05e97c1c50f39f98d608f96f0677a6589e64e53104e22904/numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7", size = 13991951 }, + { url = "https://files.pythonhosted.org/packages/22/97/dfb1a31bb46686f09e68ea6ac5c63fdee0d22d7b23b8f3f7ea07712869ef/numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5", size = 17278923 }, + { url = "https://files.pythonhosted.org/packages/35/e2/76a11e54139654a324d107da1d98f99e7aa2a7ef97cfd7c631fba7dbde71/numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d", size = 12422446 }, + { url = "https://files.pythonhosted.org/packages/d8/ec/ebef2f7d7c28503f958f0f8b992e7ce606fb74f9e891199329d5f5f87404/numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694", size = 14834466 }, + { url = 
"https://files.pythonhosted.org/packages/11/10/943cfb579f1a02909ff96464c69893b1d25be3731b5d3652c2e0cf1281ea/numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61", size = 19780722 }, + { url = "https://files.pythonhosted.org/packages/a7/ae/f53b7b265fdc701e663fbb322a8e9d4b14d9cb7b2385f45ddfabfc4327e4/numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f", size = 13843102 }, + { url = "https://files.pythonhosted.org/packages/25/6f/2586a50ad72e8dbb1d8381f837008a0321a3516dfd7cb57fc8cf7e4bb06b/numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e", size = 14039616 }, + { url = "https://files.pythonhosted.org/packages/98/5d/5738903efe0ecb73e51eb44feafba32bdba2081263d40c5043568ff60faf/numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc", size = 17316263 }, + { url = "https://files.pythonhosted.org/packages/d1/57/8d328f0b91c733aa9aa7ee540dbc49b58796c862b4fbcb1146c701e888da/numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2", size = 12455660 }, + { url = "https://files.pythonhosted.org/packages/69/65/0d47953afa0ad569d12de5f65d964321c208492064c38fe3b0b9744f8d44/numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706", size = 14868112 }, + { url = "https://files.pythonhosted.org/packages/9a/cd/d5b0402b801c8a8b56b04c1e85c6165efab298d2f0ab741c2406516ede3a/numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400", size = 19816549 }, + { url = 
"https://files.pythonhosted.org/packages/14/27/638aaa446f39113a3ed38b37a66243e21b38110d021bfcb940c383e120f2/numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f", size = 13879950 }, + { url = "https://files.pythonhosted.org/packages/8f/27/91894916e50627476cff1a4e4363ab6179d01077d71b9afed41d9e1f18bf/numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9", size = 14030228 }, + { url = "https://files.pythonhosted.org/packages/7a/7c/d7b2a0417af6428440c0ad7cb9799073e507b1a465f827d058b826236964/numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d", size = 17311170 }, + { url = "https://files.pythonhosted.org/packages/18/9d/e02ace5d7dfccee796c37b995c63322674daf88ae2f4a4724c5dd0afcc91/numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835", size = 12454918 }, + { url = "https://files.pythonhosted.org/packages/63/38/6cc19d6b8bfa1d1a459daf2b3fe325453153ca7019976274b6f33d8b5663/numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8", size = 14867441 }, + { url = "https://files.pythonhosted.org/packages/a4/fd/8dff40e25e937c94257455c237b9b6bf5a30d42dd1cc11555533be099492/numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef", size = 19156590 }, + { url = "https://files.pythonhosted.org/packages/42/e7/4bf953c6e05df90c6d351af69966384fed8e988d0e8c54dad7103b59f3ba/numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a", size = 16705744 }, + { url = 
"https://files.pythonhosted.org/packages/fc/dd/9106005eb477d022b60b3817ed5937a43dad8fd1f20b0610ea8a32fcb407/numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2", size = 14734290 }, +] + +[[package]] +name = "numpy" +version = "2.0.2" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version == '3.9.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/21/91/3495b3237510f79f5d81f2508f9f13fea78ebfdf07538fc7444badda173d/numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece", size = 21165245 }, + { url = "https://files.pythonhosted.org/packages/05/33/26178c7d437a87082d11019292dce6d3fe6f0e9026b7b2309cbf3e489b1d/numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04", size = 13738540 }, + { url = "https://files.pythonhosted.org/packages/ec/31/cc46e13bf07644efc7a4bf68df2df5fb2a1a88d0cd0da9ddc84dc0033e51/numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66", size = 5300623 }, + { url = "https://files.pythonhosted.org/packages/6e/16/7bfcebf27bb4f9d7ec67332ffebee4d1bf085c84246552d52dbb548600e7/numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b", size = 6901774 }, + { url = "https://files.pythonhosted.org/packages/f9/a3/561c531c0e8bf082c5bef509d00d56f82e0ea7e1e3e3a7fc8fa78742a6e5/numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd", size = 13907081 }, + { url = "https://files.pythonhosted.org/packages/fa/66/f7177ab331876200ac7563a580140643d1179c8b4b6a6b0fc9838de2a9b8/numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318", size = 19523451 }, + { url = "https://files.pythonhosted.org/packages/25/7f/0b209498009ad6453e4efc2c65bcdf0ae08a182b2b7877d7ab38a92dc542/numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8", size = 19927572 }, + { url = "https://files.pythonhosted.org/packages/3e/df/2619393b1e1b565cd2d4c4403bdd979621e2c4dea1f8532754b2598ed63b/numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326", size = 14400722 }, + { url = "https://files.pythonhosted.org/packages/22/ad/77e921b9f256d5da36424ffb711ae79ca3f451ff8489eeca544d0701d74a/numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97", size = 6472170 }, + { url = "https://files.pythonhosted.org/packages/10/05/3442317535028bc29cf0c0dd4c191a4481e8376e9f0db6bcf29703cadae6/numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131", size = 15905558 }, + { url = "https://files.pythonhosted.org/packages/8b/cf/034500fb83041aa0286e0fb16e7c76e5c8b67c0711bb6e9e9737a717d5fe/numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448", size = 21169137 }, + { url = "https://files.pythonhosted.org/packages/4a/d9/32de45561811a4b87fbdee23b5797394e3d1504b4a7cf40c10199848893e/numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195", size = 13703552 }, + { url = 
"https://files.pythonhosted.org/packages/c1/ca/2f384720020c7b244d22508cb7ab23d95f179fcfff33c31a6eeba8d6c512/numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57", size = 5298957 }, + { url = "https://files.pythonhosted.org/packages/0e/78/a3e4f9fb6aa4e6fdca0c5428e8ba039408514388cf62d89651aade838269/numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a", size = 6905573 }, + { url = "https://files.pythonhosted.org/packages/a0/72/cfc3a1beb2caf4efc9d0b38a15fe34025230da27e1c08cc2eb9bfb1c7231/numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669", size = 13914330 }, + { url = "https://files.pythonhosted.org/packages/ba/a8/c17acf65a931ce551fee11b72e8de63bf7e8a6f0e21add4c937c83563538/numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951", size = 19534895 }, + { url = "https://files.pythonhosted.org/packages/ba/86/8767f3d54f6ae0165749f84648da9dcc8cd78ab65d415494962c86fac80f/numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9", size = 19937253 }, + { url = "https://files.pythonhosted.org/packages/df/87/f76450e6e1c14e5bb1eae6836478b1028e096fd02e85c1c37674606ab752/numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15", size = 14414074 }, + { url = "https://files.pythonhosted.org/packages/5c/ca/0f0f328e1e59f73754f06e1adfb909de43726d4f24c6a3f8805f34f2b0fa/numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4", size = 6470640 }, + { url = 
"https://files.pythonhosted.org/packages/eb/57/3a3f14d3a759dcf9bf6e9eda905794726b758819df4663f217d658a58695/numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc", size = 15910230 }, + { url = "https://files.pythonhosted.org/packages/45/40/2e117be60ec50d98fa08c2f8c48e09b3edea93cfcabd5a9ff6925d54b1c2/numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b", size = 20895803 }, + { url = "https://files.pythonhosted.org/packages/46/92/1b8b8dee833f53cef3e0a3f69b2374467789e0bb7399689582314df02651/numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e", size = 13471835 }, + { url = "https://files.pythonhosted.org/packages/7f/19/e2793bde475f1edaea6945be141aef6c8b4c669b90c90a300a8954d08f0a/numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c", size = 5038499 }, + { url = "https://files.pythonhosted.org/packages/e3/ff/ddf6dac2ff0dd50a7327bcdba45cb0264d0e96bb44d33324853f781a8f3c/numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c", size = 6633497 }, + { url = "https://files.pythonhosted.org/packages/72/21/67f36eac8e2d2cd652a2e69595a54128297cdcb1ff3931cfc87838874bd4/numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692", size = 13621158 }, + { url = "https://files.pythonhosted.org/packages/39/68/e9f1126d757653496dbc096cb429014347a36b228f5a991dae2c6b6cfd40/numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a", size = 19236173 }, + { url = 
"https://files.pythonhosted.org/packages/d1/e9/1f5333281e4ebf483ba1c888b1d61ba7e78d7e910fdd8e6499667041cc35/numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c", size = 19634174 }, + { url = "https://files.pythonhosted.org/packages/71/af/a469674070c8d8408384e3012e064299f7a2de540738a8e414dcfd639996/numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded", size = 14099701 }, + { url = "https://files.pythonhosted.org/packages/d0/3d/08ea9f239d0e0e939b6ca52ad403c84a2bce1bde301a8eb4888c1c1543f1/numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5", size = 6174313 }, + { url = "https://files.pythonhosted.org/packages/b2/b5/4ac39baebf1fdb2e72585c8352c56d063b6126be9fc95bd2bb5ef5770c20/numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a", size = 15606179 }, + { url = "https://files.pythonhosted.org/packages/43/c1/41c8f6df3162b0c6ffd4437d729115704bd43363de0090c7f913cfbc2d89/numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c", size = 21169942 }, + { url = "https://files.pythonhosted.org/packages/39/bc/fd298f308dcd232b56a4031fd6ddf11c43f9917fbc937e53762f7b5a3bb1/numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd", size = 13711512 }, + { url = "https://files.pythonhosted.org/packages/96/ff/06d1aa3eeb1c614eda245c1ba4fb88c483bee6520d361641331872ac4b82/numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b", size = 5306976 }, + { url = 
"https://files.pythonhosted.org/packages/2d/98/121996dcfb10a6087a05e54453e28e58694a7db62c5a5a29cee14c6e047b/numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729", size = 6906494 }, + { url = "https://files.pythonhosted.org/packages/15/31/9dffc70da6b9bbf7968f6551967fc21156207366272c2a40b4ed6008dc9b/numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1", size = 13912596 }, + { url = "https://files.pythonhosted.org/packages/b9/14/78635daab4b07c0930c919d451b8bf8c164774e6a3413aed04a6d95758ce/numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd", size = 19526099 }, + { url = "https://files.pythonhosted.org/packages/26/4c/0eeca4614003077f68bfe7aac8b7496f04221865b3a5e7cb230c9d055afd/numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d", size = 19932823 }, + { url = "https://files.pythonhosted.org/packages/f1/46/ea25b98b13dccaebddf1a803f8c748680d972e00507cd9bc6dcdb5aa2ac1/numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d", size = 14404424 }, + { url = "https://files.pythonhosted.org/packages/c8/a6/177dd88d95ecf07e722d21008b1b40e681a929eb9e329684d449c36586b2/numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa", size = 6476809 }, + { url = "https://files.pythonhosted.org/packages/ea/2b/7fc9f4e7ae5b507c1a3a21f0f15ed03e794c1242ea8a242ac158beb56034/numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73", size = 15911314 }, + { url = 
"https://files.pythonhosted.org/packages/8f/3b/df5a870ac6a3be3a86856ce195ef42eec7ae50d2a202be1f5a4b3b340e14/numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8", size = 21025288 }, + { url = "https://files.pythonhosted.org/packages/2c/97/51af92f18d6f6f2d9ad8b482a99fb74e142d71372da5d834b3a2747a446e/numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4", size = 6762793 }, + { url = "https://files.pythonhosted.org/packages/12/46/de1fbd0c1b5ccaa7f9a005b66761533e2f6a3e560096682683a223631fe9/numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c", size = 19334885 }, + { url = "https://files.pythonhosted.org/packages/cc/dc/d330a6faefd92b446ec0f0dfea4c3207bb1fef3c4771d19cf4543efd2c78/numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385", size = 15828784 }, +] + +[[package]] +name = "numpy" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/f2/a5/fdbf6a7871703df6160b5cf3dd774074b086d278172285c52c2758b76305/numpy-2.2.1.tar.gz", hash = "sha256:45681fd7128c8ad1c379f0ca0776a8b0c6583d2f69889ddac01559dfe4390918", size = 20227662 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/c4/5588367dc9f91e1a813beb77de46ea8cab13f778e1b3a0e661ab031aba44/numpy-2.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5edb4e4caf751c1518e6a26a83501fda79bff41cc59dac48d70e6d65d4ec4440", size = 21213214 }, + { url = 
"https://files.pythonhosted.org/packages/d8/8b/32dd9f08419023a4cf856c5ad0b4eba9b830da85eafdef841a104c4fc05a/numpy-2.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:aa3017c40d513ccac9621a2364f939d39e550c542eb2a894b4c8da92b38896ab", size = 14352248 }, + { url = "https://files.pythonhosted.org/packages/84/2d/0e895d02940ba6e12389f0ab5cac5afcf8dc2dc0ade4e8cad33288a721bd/numpy-2.2.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:61048b4a49b1c93fe13426e04e04fdf5a03f456616f6e98c7576144677598675", size = 5391007 }, + { url = "https://files.pythonhosted.org/packages/11/b9/7f1e64a0d46d9c2af6d17966f641fb12d5b8ea3003f31b2308f3e3b9a6aa/numpy-2.2.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:7671dc19c7019103ca44e8d94917eba8534c76133523ca8406822efdd19c9308", size = 6926174 }, + { url = "https://files.pythonhosted.org/packages/2e/8c/043fa4418bc9364e364ab7aba8ff6ef5f6b9171ade22de8fbcf0e2fa4165/numpy-2.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4250888bcb96617e00bfa28ac24850a83c9f3a16db471eca2ee1f1714df0f957", size = 14330914 }, + { url = "https://files.pythonhosted.org/packages/f7/b6/d8110985501ca8912dfc1c3bbef99d66e62d487f72e46b2337494df77364/numpy-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a7746f235c47abc72b102d3bce9977714c2444bdfaea7888d241b4c4bb6a78bf", size = 16379607 }, + { url = "https://files.pythonhosted.org/packages/e2/57/bdca9fb8bdaa810c3a4ff2eb3231379b77f618a7c0d24be9f7070db50775/numpy-2.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:059e6a747ae84fce488c3ee397cee7e5f905fd1bda5fb18c66bc41807ff119b2", size = 15541760 }, + { url = "https://files.pythonhosted.org/packages/97/55/3b9147b3cbc3b6b1abc2a411dec5337a46c873deca0dd0bf5bef9d0579cc/numpy-2.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f62aa6ee4eb43b024b0e5a01cf65a0bb078ef8c395e8713c6e8a12a697144528", size = 18168476 }, + { url = 
"https://files.pythonhosted.org/packages/00/e7/7c2cde16c9b87a8e14fdd262ca7849c4681cf48c8a774505f7e6f5e3b643/numpy-2.2.1-cp310-cp310-win32.whl", hash = "sha256:48fd472630715e1c1c89bf1feab55c29098cb403cc184b4859f9c86d4fcb6a95", size = 6570985 }, + { url = "https://files.pythonhosted.org/packages/a1/a8/554b0e99fc4ac11ec481254781a10da180d0559c2ebf2c324232317349ee/numpy-2.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:b541032178a718c165a49638d28272b771053f628382d5e9d1c93df23ff58dbf", size = 12913384 }, + { url = "https://files.pythonhosted.org/packages/59/14/645887347124e101d983e1daf95b48dc3e136bf8525cb4257bf9eab1b768/numpy-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:40f9e544c1c56ba8f1cf7686a8c9b5bb249e665d40d626a23899ba6d5d9e1484", size = 21217379 }, + { url = "https://files.pythonhosted.org/packages/9f/fd/2279000cf29f58ccfd3778cbf4670dfe3f7ce772df5e198c5abe9e88b7d7/numpy-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f9b57eaa3b0cd8db52049ed0330747b0364e899e8a606a624813452b8203d5f7", size = 14388520 }, + { url = "https://files.pythonhosted.org/packages/58/b0/034eb5d5ba12d66ab658ff3455a31f20add0b78df8203c6a7451bd1bee21/numpy-2.2.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:bc8a37ad5b22c08e2dbd27df2b3ef7e5c0864235805b1e718a235bcb200cf1cb", size = 5389286 }, + { url = "https://files.pythonhosted.org/packages/5d/69/6f3cccde92e82e7835fdb475c2bf439761cbf8a1daa7c07338e1e132dfec/numpy-2.2.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:9036d6365d13b6cbe8f27a0eaf73ddcc070cae584e5ff94bb45e3e9d729feab5", size = 6930345 }, + { url = "https://files.pythonhosted.org/packages/d1/72/1cd38e91ab563e67f584293fcc6aca855c9ae46dba42e6b5ff4600022899/numpy-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51faf345324db860b515d3f364eaa93d0e0551a88d6218a7d61286554d190d73", size = 14335748 }, + { url = 
"https://files.pythonhosted.org/packages/f2/d4/f999444e86986f3533e7151c272bd8186c55dda554284def18557e013a2a/numpy-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38efc1e56b73cc9b182fe55e56e63b044dd26a72128fd2fbd502f75555d92591", size = 16391057 }, + { url = "https://files.pythonhosted.org/packages/99/7b/85cef6a3ae1b19542b7afd97d0b296526b6ef9e3c43ea0c4d9c4404fb2d0/numpy-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:31b89fa67a8042e96715c68e071a1200c4e172f93b0fbe01a14c0ff3ff820fc8", size = 15556943 }, + { url = "https://files.pythonhosted.org/packages/69/7e/b83cc884c3508e91af78760f6b17ab46ad649831b1fa35acb3eb26d9e6d2/numpy-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4c86e2a209199ead7ee0af65e1d9992d1dce7e1f63c4b9a616500f93820658d0", size = 18180785 }, + { url = "https://files.pythonhosted.org/packages/b2/9f/eb4a9a38867de059dcd4b6e18d47c3867fbd3795d4c9557bb49278f94087/numpy-2.2.1-cp311-cp311-win32.whl", hash = "sha256:b34d87e8a3090ea626003f87f9392b3929a7bbf4104a05b6667348b6bd4bf1cd", size = 6568983 }, + { url = "https://files.pythonhosted.org/packages/6d/1e/be3b9f3073da2f8c7fa361fcdc231b548266b0781029fdbaf75eeab997fd/numpy-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:360137f8fb1b753c5cde3ac388597ad680eccbbbb3865ab65efea062c4a1fd16", size = 12917260 }, + { url = "https://files.pythonhosted.org/packages/62/12/b928871c570d4a87ab13d2cc19f8817f17e340d5481621930e76b80ffb7d/numpy-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:694f9e921a0c8f252980e85bce61ebbd07ed2b7d4fa72d0e4246f2f8aa6642ab", size = 20909861 }, + { url = "https://files.pythonhosted.org/packages/3d/c3/59df91ae1d8ad7c5e03efd63fd785dec62d96b0fe56d1f9ab600b55009af/numpy-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3683a8d166f2692664262fd4900f207791d005fb088d7fdb973cc8d663626faa", size = 14095776 }, + { url = 
"https://files.pythonhosted.org/packages/af/4e/8ed5868efc8e601fb69419644a280e9c482b75691466b73bfaab7d86922c/numpy-2.2.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:780077d95eafc2ccc3ced969db22377b3864e5b9a0ea5eb347cc93b3ea900315", size = 5126239 }, + { url = "https://files.pythonhosted.org/packages/1a/74/dd0bbe650d7bc0014b051f092f2de65e34a8155aabb1287698919d124d7f/numpy-2.2.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:55ba24ebe208344aa7a00e4482f65742969a039c2acfcb910bc6fcd776eb4355", size = 6659296 }, + { url = "https://files.pythonhosted.org/packages/7f/11/4ebd7a3f4a655764dc98481f97bd0a662fb340d1001be6050606be13e162/numpy-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b1d07b53b78bf84a96898c1bc139ad7f10fda7423f5fd158fd0f47ec5e01ac7", size = 14047121 }, + { url = "https://files.pythonhosted.org/packages/7f/a7/c1f1d978166eb6b98ad009503e4d93a8c1962d0eb14a885c352ee0276a54/numpy-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5062dc1a4e32a10dc2b8b13cedd58988261416e811c1dc4dbdea4f57eea61b0d", size = 16096599 }, + { url = "https://files.pythonhosted.org/packages/3d/6d/0e22afd5fcbb4d8d0091f3f46bf4e8906399c458d4293da23292c0ba5022/numpy-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:fce4f615f8ca31b2e61aa0eb5865a21e14f5629515c9151850aa936c02a1ee51", size = 15243932 }, + { url = "https://files.pythonhosted.org/packages/03/39/e4e5832820131ba424092b9610d996b37e5557180f8e2d6aebb05c31ae54/numpy-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:67d4cda6fa6ffa073b08c8372aa5fa767ceb10c9a0587c707505a6d426f4e046", size = 17861032 }, + { url = "https://files.pythonhosted.org/packages/5f/8a/3794313acbf5e70df2d5c7d2aba8718676f8d054a05abe59e48417fb2981/numpy-2.2.1-cp312-cp312-win32.whl", hash = "sha256:32cb94448be47c500d2c7a95f93e2f21a01f1fd05dd2beea1ccd049bb6001cd2", size = 6274018 }, + { url = 
"https://files.pythonhosted.org/packages/17/c1/c31d3637f2641e25c7a19adf2ae822fdaf4ddd198b05d79a92a9ce7cb63e/numpy-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:ba5511d8f31c033a5fcbda22dd5c813630af98c70b2661f2d2c654ae3cdfcfc8", size = 12613843 }, + { url = "https://files.pythonhosted.org/packages/20/d6/91a26e671c396e0c10e327b763485ee295f5a5a7a48c553f18417e5a0ed5/numpy-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f1d09e520217618e76396377c81fba6f290d5f926f50c35f3a5f72b01a0da780", size = 20896464 }, + { url = "https://files.pythonhosted.org/packages/8c/40/5792ccccd91d45e87d9e00033abc4f6ca8a828467b193f711139ff1f1cd9/numpy-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3ecc47cd7f6ea0336042be87d9e7da378e5c7e9b3c8ad0f7c966f714fc10d821", size = 14111350 }, + { url = "https://files.pythonhosted.org/packages/c0/2a/fb0a27f846cb857cef0c4c92bef89f133a3a1abb4e16bba1c4dace2e9b49/numpy-2.2.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:f419290bc8968a46c4933158c91a0012b7a99bb2e465d5ef5293879742f8797e", size = 5111629 }, + { url = "https://files.pythonhosted.org/packages/eb/e5/8e81bb9d84db88b047baf4e8b681a3e48d6390bc4d4e4453eca428ecbb49/numpy-2.2.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:5b6c390bfaef8c45a260554888966618328d30e72173697e5cabe6b285fb2348", size = 6645865 }, + { url = "https://files.pythonhosted.org/packages/7a/1a/a90ceb191dd2f9e2897c69dde93ccc2d57dd21ce2acbd7b0333e8eea4e8d/numpy-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:526fc406ab991a340744aad7e25251dd47a6720a685fa3331e5c59fef5282a59", size = 14043508 }, + { url = "https://files.pythonhosted.org/packages/f1/5a/e572284c86a59dec0871a49cd4e5351e20b9c751399d5f1d79628c0542cb/numpy-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f74e6fdeb9a265624ec3a3918430205dff1df7e95a230779746a6af78bc615af", size = 16094100 }, + { url = 
"https://files.pythonhosted.org/packages/0c/2c/a79d24f364788386d85899dd280a94f30b0950be4b4a545f4fa4ed1d4ca7/numpy-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:53c09385ff0b72ba79d8715683c1168c12e0b6e84fb0372e97553d1ea91efe51", size = 15239691 }, + { url = "https://files.pythonhosted.org/packages/cf/79/1e20fd1c9ce5a932111f964b544facc5bb9bde7865f5b42f00b4a6a9192b/numpy-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f3eac17d9ec51be534685ba877b6ab5edc3ab7ec95c8f163e5d7b39859524716", size = 17856571 }, + { url = "https://files.pythonhosted.org/packages/be/5b/cc155e107f75d694f562bdc84a26cc930569f3dfdfbccb3420b626065777/numpy-2.2.1-cp313-cp313-win32.whl", hash = "sha256:9ad014faa93dbb52c80d8f4d3dcf855865c876c9660cb9bd7553843dd03a4b1e", size = 6270841 }, + { url = "https://files.pythonhosted.org/packages/44/be/0e5cd009d2162e4138d79a5afb3b5d2341f0fe4777ab6e675aa3d4a42e21/numpy-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:164a829b6aacf79ca47ba4814b130c4020b202522a93d7bff2202bfb33b61c60", size = 12606618 }, + { url = "https://files.pythonhosted.org/packages/a8/87/04ddf02dd86fb17c7485a5f87b605c4437966d53de1e3745d450343a6f56/numpy-2.2.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4dfda918a13cc4f81e9118dea249e192ab167a0bb1966272d5503e39234d694e", size = 20921004 }, + { url = "https://files.pythonhosted.org/packages/6e/3e/d0e9e32ab14005425d180ef950badf31b862f3839c5b927796648b11f88a/numpy-2.2.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:733585f9f4b62e9b3528dd1070ec4f52b8acf64215b60a845fa13ebd73cd0712", size = 14119910 }, + { url = "https://files.pythonhosted.org/packages/b5/5b/aa2d1905b04a8fb681e08742bb79a7bddfc160c7ce8e1ff6d5c821be0236/numpy-2.2.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:89b16a18e7bba224ce5114db863e7029803c179979e1af6ad6a6b11f70545008", size = 5153612 }, + { url = 
"https://files.pythonhosted.org/packages/ce/35/6831808028df0648d9b43c5df7e1051129aa0d562525bacb70019c5f5030/numpy-2.2.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:676f4eebf6b2d430300f1f4f4c2461685f8269f94c89698d832cdf9277f30b84", size = 6668401 }, + { url = "https://files.pythonhosted.org/packages/b1/38/10ef509ad63a5946cc042f98d838daebfe7eaf45b9daaf13df2086b15ff9/numpy-2.2.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27f5cdf9f493b35f7e41e8368e7d7b4bbafaf9660cba53fb21d2cd174ec09631", size = 14014198 }, + { url = "https://files.pythonhosted.org/packages/df/f8/c80968ae01df23e249ee0a4487fae55a4c0fe2f838dfe9cc907aa8aea0fa/numpy-2.2.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1ad395cf254c4fbb5b2132fee391f361a6e8c1adbd28f2cd8e79308a615fe9d", size = 16076211 }, + { url = "https://files.pythonhosted.org/packages/09/69/05c169376016a0b614b432967ac46ff14269eaffab80040ec03ae1ae8e2c/numpy-2.2.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:08ef779aed40dbc52729d6ffe7dd51df85796a702afbf68a4f4e41fafdc8bda5", size = 15220266 }, + { url = "https://files.pythonhosted.org/packages/f1/ff/94a4ce67ea909f41cf7ea712aebbe832dc67decad22944a1020bb398a5ee/numpy-2.2.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:26c9c4382b19fcfbbed3238a14abf7ff223890ea1936b8890f058e7ba35e8d71", size = 17852844 }, + { url = "https://files.pythonhosted.org/packages/46/72/8a5dbce4020dfc595592333ef2fbb0a187d084ca243b67766d29d03e0096/numpy-2.2.1-cp313-cp313t-win32.whl", hash = "sha256:93cf4e045bae74c90ca833cba583c14b62cb4ba2cba0abd2b141ab52548247e2", size = 6326007 }, + { url = "https://files.pythonhosted.org/packages/7b/9c/4fce9cf39dde2562584e4cfd351a0140240f82c0e3569ce25a250f47037d/numpy-2.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:bff7d8ec20f5f42607599f9994770fa65d76edca264a87b5e4ea5629bce12268", size = 12693107 }, + { url = 
"https://files.pythonhosted.org/packages/f1/65/d36a76b811ffe0a4515e290cb05cb0e22171b1b0f0db6bee9141cf023545/numpy-2.2.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7ba9cc93a91d86365a5d270dee221fdc04fb68d7478e6bf6af650de78a8339e3", size = 21044672 }, + { url = "https://files.pythonhosted.org/packages/aa/3f/b644199f165063154df486d95198d814578f13dd4d8c1651e075bf1cb8af/numpy-2.2.1-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:3d03883435a19794e41f147612a77a8f56d4e52822337844fff3d4040a142964", size = 6789873 }, + { url = "https://files.pythonhosted.org/packages/d7/df/2adb0bb98a3cbe8a6c3c6d1019aede1f1d8b83927ced228a46cc56c7a206/numpy-2.2.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4511d9e6071452b944207c8ce46ad2f897307910b402ea5fa975da32e0102800", size = 16194933 }, + { url = "https://files.pythonhosted.org/packages/13/3e/1959d5219a9e6d200638d924cedda6a606392f7186a4ed56478252e70d55/numpy-2.2.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5c5cc0cbabe9452038ed984d05ac87910f89370b9242371bd9079cb4af61811e", size = 12820057 }, +] + +[[package]] +name = "packaging" +version = "24.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d0/63/68dbb6eb2de9cb10ee4c9c14a0148804425e13c4fb20d61cce69f53106da/packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f", size = 163950 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 }, +] + +[[package]] +name = "pandas" +version = "2.0.3" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +dependencies = [ + { name = "numpy", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, 
marker = "python_full_version < '3.9'" }, + { name = "python-dateutil", marker = "python_full_version < '3.9'" }, + { name = "pytz", marker = "python_full_version < '3.9'" }, + { name = "tzdata", marker = "python_full_version < '3.9'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/a7/824332581e258b5aa4f3763ecb2a797e5f9a54269044ba2e50ac19936b32/pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c", size = 5284455 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3c/b2/0d4a5729ce1ce11630c4fc5d5522a33b967b3ca146c210f58efde7c40e99/pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8", size = 11760908 }, + { url = "https://files.pythonhosted.org/packages/4a/f6/f620ca62365d83e663a255a41b08d2fc2eaf304e0b8b21bb6d62a7390fe3/pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f", size = 10823486 }, + { url = "https://files.pythonhosted.org/packages/c2/59/cb4234bc9b968c57e81861b306b10cd8170272c57b098b724d3de5eda124/pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183", size = 11571897 }, + { url = "https://files.pythonhosted.org/packages/e3/59/35a2892bf09ded9c1bf3804461efe772836a5261ef5dfb4e264ce813ff99/pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0", size = 12306421 }, + { url = "https://files.pythonhosted.org/packages/94/71/3a0c25433c54bb29b48e3155b959ac78f4c4f2f06f94d8318aac612cb80f/pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210", size = 9540792 }, + { url = 
"https://files.pythonhosted.org/packages/ed/30/b97456e7063edac0e5a405128065f0cd2033adfe3716fb2256c186bd41d0/pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e", size = 10664333 }, + { url = "https://files.pythonhosted.org/packages/b3/92/a5e5133421b49e901a12e02a6a7ef3a0130e10d13db8cb657fdd0cba3b90/pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8", size = 11645672 }, + { url = "https://files.pythonhosted.org/packages/8f/bb/aea1fbeed5b474cb8634364718abe9030d7cc7a30bf51f40bd494bbc89a2/pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26", size = 10693229 }, + { url = "https://files.pythonhosted.org/packages/d6/90/e7d387f1a416b14e59290baa7a454a90d719baebbf77433ff1bdcc727800/pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d", size = 11581591 }, + { url = "https://files.pythonhosted.org/packages/d0/28/88b81881c056376254618fad622a5e94b5126db8c61157ea1910cd1c040a/pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df", size = 12219370 }, + { url = "https://files.pythonhosted.org/packages/e4/a5/212b9039e25bf8ebb97e417a96660e3dc925dacd3f8653d531b8f7fd9be4/pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd", size = 9482935 }, + { url = "https://files.pythonhosted.org/packages/9e/71/756a1be6bee0209d8c0d8c5e3b9fc72c00373f384a4017095ec404aec3ad/pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b", size = 10607692 }, + { url = 
"https://files.pythonhosted.org/packages/78/a8/07dd10f90ca915ed914853cd57f79bfc22e1ef4384ab56cb4336d2fc1f2a/pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061", size = 11653303 }, + { url = "https://files.pythonhosted.org/packages/53/c3/f8e87361f7fdf42012def602bfa2a593423c729f5cb7c97aed7f51be66ac/pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5", size = 10710932 }, + { url = "https://files.pythonhosted.org/packages/a7/87/828d50c81ce0f434163bf70b925a0eec6076808e0bca312a79322b141f66/pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089", size = 11684018 }, + { url = "https://files.pythonhosted.org/packages/f8/7f/5b047effafbdd34e52c9e2d7e44f729a0655efafb22198c45cf692cdc157/pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0", size = 12353723 }, + { url = "https://files.pythonhosted.org/packages/ea/ae/26a2eda7fa581347d69e51f93892493b2074ef3352ac71033c9f32c52389/pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02", size = 9646403 }, + { url = "https://files.pythonhosted.org/packages/c3/6c/ea362eef61f05553aaf1a24b3e96b2d0603f5dc71a3bd35688a24ed88843/pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78", size = 10777638 }, + { url = "https://files.pythonhosted.org/packages/f8/c7/cfef920b7b457dff6928e824896cb82367650ea127d048ee0b820026db4f/pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b", size = 11834160 }, + { url = 
"https://files.pythonhosted.org/packages/6c/1c/689c9d99bc4e5d366a5fd871f0bcdee98a6581e240f96b78d2d08f103774/pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e", size = 10862752 }, + { url = "https://files.pythonhosted.org/packages/cc/b8/4d082f41c27c95bf90485d1447b647cc7e5680fea75e315669dc6e4cb398/pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b", size = 11715852 }, + { url = "https://files.pythonhosted.org/packages/9e/0d/91a9fd2c202f2b1d97a38ab591890f86480ecbb596cbc56d035f6f23fdcc/pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641", size = 12398496 }, + { url = "https://files.pythonhosted.org/packages/26/7d/d8aa0a2c4f3f5f8ea59fb946c8eafe8f508090ca73e2b08a9af853c1103e/pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682", size = 9630766 }, + { url = "https://files.pythonhosted.org/packages/9a/f2/0ad053856debbe90c83de1b4f05915f85fd2146f20faf9daa3b320d36df3/pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc", size = 10755902 }, +] + +[[package]] +name = "pandas" +version = "2.2.3" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", + "python_full_version == '3.9.*'", +] +dependencies = [ + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "python-dateutil", marker = "python_full_version >= '3.9'" }, + { name = 
"pytz", marker = "python_full_version >= '3.9'" }, + { name = "tzdata", marker = "python_full_version >= '3.9'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/70/c853aec59839bceed032d52010ff5f1b8d87dc3114b762e4ba2727661a3b/pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5", size = 12580827 }, + { url = "https://files.pythonhosted.org/packages/99/f2/c4527768739ffa4469b2b4fff05aa3768a478aed89a2f271a79a40eee984/pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348", size = 11303897 }, + { url = "https://files.pythonhosted.org/packages/ed/12/86c1747ea27989d7a4064f806ce2bae2c6d575b950be087837bdfcabacc9/pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed", size = 66480908 }, + { url = "https://files.pythonhosted.org/packages/44/50/7db2cd5e6373ae796f0ddad3675268c8d59fb6076e66f0c339d61cea886b/pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57", size = 13064210 }, + { url = "https://files.pythonhosted.org/packages/61/61/a89015a6d5536cb0d6c3ba02cebed51a95538cf83472975275e28ebf7d0c/pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42", size = 16754292 }, + { url = "https://files.pythonhosted.org/packages/ce/0d/4cc7b69ce37fac07645a94e1d4b0880b15999494372c1523508511b09e40/pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = 
"sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f", size = 14416379 }, + { url = "https://files.pythonhosted.org/packages/31/9e/6ebb433de864a6cd45716af52a4d7a8c3c9aaf3a98368e61db9e69e69a9c/pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645", size = 11598471 }, + { url = "https://files.pythonhosted.org/packages/a8/44/d9502bf0ed197ba9bf1103c9867d5904ddcaf869e52329787fc54ed70cc8/pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039", size = 12602222 }, + { url = "https://files.pythonhosted.org/packages/52/11/9eac327a38834f162b8250aab32a6781339c69afe7574368fffe46387edf/pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd", size = 11321274 }, + { url = "https://files.pythonhosted.org/packages/45/fb/c4beeb084718598ba19aa9f5abbc8aed8b42f90930da861fcb1acdb54c3a/pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698", size = 15579836 }, + { url = "https://files.pythonhosted.org/packages/cd/5f/4dba1d39bb9c38d574a9a22548c540177f78ea47b32f99c0ff2ec499fac5/pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc", size = 13058505 }, + { url = "https://files.pythonhosted.org/packages/b9/57/708135b90391995361636634df1f1130d03ba456e95bcf576fada459115a/pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3", size = 16744420 }, + { url = "https://files.pythonhosted.org/packages/86/4a/03ed6b7ee323cf30404265c284cee9c65c56a212e0a08d9ee06984ba2240/pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32", size = 14440457 }, + { url = "https://files.pythonhosted.org/packages/ed/8c/87ddf1fcb55d11f9f847e3c69bb1c6f8e46e2f40ab1a2d2abadb2401b007/pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5", size = 11617166 }, + { url = "https://files.pythonhosted.org/packages/17/a3/fb2734118db0af37ea7433f57f722c0a56687e14b14690edff0cdb4b7e58/pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9", size = 12529893 }, + { url = "https://files.pythonhosted.org/packages/e1/0c/ad295fd74bfac85358fd579e271cded3ac969de81f62dd0142c426b9da91/pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4", size = 11363475 }, + { url = "https://files.pythonhosted.org/packages/c6/2a/4bba3f03f7d07207481fed47f5b35f556c7441acddc368ec43d6643c5777/pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3", size = 15188645 }, + { url = "https://files.pythonhosted.org/packages/38/f8/d8fddee9ed0d0c0f4a2132c1dfcf0e3e53265055da8df952a53e7eaf178c/pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319", size = 12739445 }, + { url = "https://files.pythonhosted.org/packages/20/e8/45a05d9c39d2cea61ab175dbe6a2de1d05b679e8de2011da4ee190d7e748/pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8", size = 16359235 }, + { url = "https://files.pythonhosted.org/packages/1d/99/617d07a6a5e429ff90c90da64d428516605a1ec7d7bea494235e1c3882de/pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a", size = 14056756 }, + { url = "https://files.pythonhosted.org/packages/29/d4/1244ab8edf173a10fd601f7e13b9566c1b525c4f365d6bee918e68381889/pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13", size = 11504248 }, + { url = "https://files.pythonhosted.org/packages/64/22/3b8f4e0ed70644e85cfdcd57454686b9057c6c38d2f74fe4b8bc2527214a/pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015", size = 12477643 }, + { url = "https://files.pythonhosted.org/packages/e4/93/b3f5d1838500e22c8d793625da672f3eec046b1a99257666c94446969282/pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28", size = 11281573 }, + { url = "https://files.pythonhosted.org/packages/f5/94/6c79b07f0e5aab1dcfa35a75f4817f5c4f677931d4234afcd75f0e6a66ca/pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0", size = 15196085 }, + { url = "https://files.pythonhosted.org/packages/e8/31/aa8da88ca0eadbabd0a639788a6da13bb2ff6edbbb9f29aa786450a30a91/pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24", size = 12711809 }, + { url = "https://files.pythonhosted.org/packages/ee/7c/c6dbdb0cb2a4344cacfb8de1c5808ca885b2e4dcfde8008266608f9372af/pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659", size = 16356316 }, + { url = "https://files.pythonhosted.org/packages/57/b7/8b757e7d92023b832869fa8881a992696a0bfe2e26f72c9ae9f255988d42/pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb", size = 14022055 }, + { url = "https://files.pythonhosted.org/packages/3b/bc/4b18e2b8c002572c5a441a64826252ce5da2aa738855747247a971988043/pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d", size = 11481175 }, + { url = "https://files.pythonhosted.org/packages/76/a3/a5d88146815e972d40d19247b2c162e88213ef51c7c25993942c39dbf41d/pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468", size = 12615650 }, + { url = "https://files.pythonhosted.org/packages/9c/8c/f0fd18f6140ddafc0c24122c8a964e48294acc579d47def376fef12bcb4a/pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18", size = 11290177 }, + { url = "https://files.pythonhosted.org/packages/ed/f9/e995754eab9c0f14c6777401f7eece0943840b7a9fc932221c19d1abee9f/pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2", size = 14651526 }, + { url = "https://files.pythonhosted.org/packages/25/b0/98d6ae2e1abac4f35230aa756005e8654649d305df9a28b16b9ae4353bff/pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4", size = 11871013 }, + { url = "https://files.pythonhosted.org/packages/cc/57/0f72a10f9db6a4628744c8e8f0df4e6e21de01212c7c981d31e50ffc8328/pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d", size = 15711620 }, + { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = 
"sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436 }, + { url = "https://files.pythonhosted.org/packages/ca/8c/8848a4c9b8fdf5a534fe2077af948bf53cd713d77ffbcd7bd15710348fd7/pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39", size = 12595535 }, + { url = "https://files.pythonhosted.org/packages/9c/b9/5cead4f63b6d31bdefeb21a679bc5a7f4aaf262ca7e07e2bc1c341b68470/pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30", size = 11319822 }, + { url = "https://files.pythonhosted.org/packages/31/af/89e35619fb573366fa68dc26dad6ad2c08c17b8004aad6d98f1a31ce4bb3/pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c", size = 15625439 }, + { url = "https://files.pythonhosted.org/packages/3d/dd/bed19c2974296661493d7acc4407b1d2db4e2a482197df100f8f965b6225/pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c", size = 13068928 }, + { url = "https://files.pythonhosted.org/packages/31/a3/18508e10a31ea108d746c848b5a05c0711e0278fa0d6f1c52a8ec52b80a5/pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea", size = 16783266 }, + { url = "https://files.pythonhosted.org/packages/c4/a5/3429bd13d82bebc78f4d78c3945efedef63a7cd0c15c17b2eeb838d1121f/pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761", size = 14450871 }, + { url = "https://files.pythonhosted.org/packages/2f/49/5c30646e96c684570925b772eac4eb0a8cb0ca590fa978f56c5d3ae73ea1/pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e", size = 
11618011 }, +] + +[[package]] +name = "parso" +version = "0.8.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/66/94/68e2e17afaa9169cf6412ab0f28623903be73d1b32e208d9e8e541bb086d/parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d", size = 400609 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650 }, +] + +[[package]] +name = "pexpect" +version = "4.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ptyprocess" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772 }, +] + +[[package]] +name = "pickleshare" +version = "0.7.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/b6/df3c1c9b616e9c0edbc4fbab6ddd09df9535849c64ba51fcb6531c32d4d8/pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca", size = 6161 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/41/220f49aaea88bc6fa6cba8d05ecf24676326156c23b991e80b3f2fc24c77/pickleshare-0.7.5-py2.py3-none-any.whl", hash = "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56", size = 6877 }, +] + +[[package]] +name = "pluggy" +version = "1.5.0" +source = { registry = 
"https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, +] + +[[package]] +name = "prompt-toolkit" +version = "3.0.48" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2d/4f/feb5e137aff82f7c7f3248267b97451da3644f6cdc218edfe549fb354127/prompt_toolkit-3.0.48.tar.gz", hash = "sha256:d6623ab0477a80df74e646bdbc93621143f5caf104206aa29294d53de1a03d90", size = 424684 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl", hash = "sha256:f49a827f90062e411f1ce1f854f2aedb3c23353244f8108b89283587397ac10e", size = 386595 }, +] + +[[package]] +name = "ptyprocess" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993 }, +] + +[[package]] +name = "pure-eval" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842 }, +] + +[[package]] +name = "pyarrow" +version = "17.0.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +dependencies = [ + { name = "numpy", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/27/4e/ea6d43f324169f8aec0e57569443a38bab4b398d09769ca64f7b4d467de3/pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28", size = 1112479 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/39/5d/78d4b040bc5ff2fc6c3d03e80fca396b742f6c125b8af06bcf7427f931bc/pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07", size = 28994846 }, + { url = "https://files.pythonhosted.org/packages/3b/73/8ed168db7642e91180330e4ea9f3ff8bab404678f00d32d7df0871a4933b/pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655", size = 27165908 }, + { url = "https://files.pythonhosted.org/packages/81/36/e78c24be99242063f6d0590ef68c857ea07bdea470242c361e9a15bd57a4/pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545", size = 39264209 }, + { url = 
"https://files.pythonhosted.org/packages/18/4c/3db637d7578f683b0a8fb8999b436bdbedd6e3517bd4f90c70853cf3ad20/pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2", size = 39862883 }, + { url = "https://files.pythonhosted.org/packages/81/3c/0580626896c842614a523e66b351181ed5bb14e5dfc263cd68cea2c46d90/pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8", size = 38723009 }, + { url = "https://files.pythonhosted.org/packages/ee/fb/c1b47f0ada36d856a352da261a44d7344d8f22e2f7db3945f8c3b81be5dd/pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047", size = 39855626 }, + { url = "https://files.pythonhosted.org/packages/19/09/b0a02908180a25d57312ab5919069c39fddf30602568980419f4b02393f6/pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087", size = 25147242 }, + { url = "https://files.pythonhosted.org/packages/f9/46/ce89f87c2936f5bb9d879473b9663ce7a4b1f4359acc2f0eb39865eaa1af/pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977", size = 29028748 }, + { url = "https://files.pythonhosted.org/packages/8d/8e/ce2e9b2146de422f6638333c01903140e9ada244a2a477918a368306c64c/pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3", size = 27190965 }, + { url = "https://files.pythonhosted.org/packages/3b/c8/5675719570eb1acd809481c6d64e2136ffb340bc387f4ca62dce79516cea/pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15", size = 39269081 }, + { url = 
"https://files.pythonhosted.org/packages/5e/78/3931194f16ab681ebb87ad252e7b8d2c8b23dad49706cadc865dff4a1dd3/pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597", size = 39864921 }, + { url = "https://files.pythonhosted.org/packages/d8/81/69b6606093363f55a2a574c018901c40952d4e902e670656d18213c71ad7/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420", size = 38740798 }, + { url = "https://files.pythonhosted.org/packages/4c/21/9ca93b84b92ef927814cb7ba37f0774a484c849d58f0b692b16af8eebcfb/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4", size = 39871877 }, + { url = "https://files.pythonhosted.org/packages/30/d1/63a7c248432c71c7d3ee803e706590a0b81ce1a8d2b2ae49677774b813bb/pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03", size = 25151089 }, + { url = "https://files.pythonhosted.org/packages/d4/62/ce6ac1275a432b4a27c55fe96c58147f111d8ba1ad800a112d31859fae2f/pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22", size = 29019418 }, + { url = "https://files.pythonhosted.org/packages/8e/0a/dbd0c134e7a0c30bea439675cc120012337202e5fac7163ba839aa3691d2/pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053", size = 27152197 }, + { url = "https://files.pythonhosted.org/packages/cb/05/3f4a16498349db79090767620d6dc23c1ec0c658a668d61d76b87706c65d/pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a", size = 39263026 }, + { url = 
"https://files.pythonhosted.org/packages/c2/0c/ea2107236740be8fa0e0d4a293a095c9f43546a2465bb7df34eee9126b09/pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc", size = 39880798 }, + { url = "https://files.pythonhosted.org/packages/f6/b0/b9164a8bc495083c10c281cc65064553ec87b7537d6f742a89d5953a2a3e/pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a", size = 38715172 }, + { url = "https://files.pythonhosted.org/packages/f1/c4/9625418a1413005e486c006e56675334929fad864347c5ae7c1b2e7fe639/pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b", size = 39874508 }, + { url = "https://files.pythonhosted.org/packages/ae/49/baafe2a964f663413be3bd1cf5c45ed98c5e42e804e2328e18f4570027c1/pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7", size = 25099235 }, + { url = "https://files.pythonhosted.org/packages/8d/bd/8f52c1d7b430260f80a349cffa2df351750a737b5336313d56dcadeb9ae1/pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204", size = 28999345 }, + { url = "https://files.pythonhosted.org/packages/64/d9/51e35550f2f18b8815a2ab25948f735434db32000c0e91eba3a32634782a/pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8", size = 27168441 }, + { url = "https://files.pythonhosted.org/packages/18/d8/7161d87d07ea51be70c49f615004c1446d5723622a18b2681f7e4b71bf6e/pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155", size = 39363163 }, + { url = 
"https://files.pythonhosted.org/packages/3f/08/bc497130789833de09e345e3ce4647e3ce86517c4f70f2144f0367ca378b/pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145", size = 39965253 }, + { url = "https://files.pythonhosted.org/packages/d3/2e/493dd7db889402b4c7871ca7dfdd20f2c5deedbff802d3eb8576359930f9/pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c", size = 38805378 }, + { url = "https://files.pythonhosted.org/packages/e6/c1/4c6bcdf7a820034aa91a8b4d25fef38809be79b42ca7aaa16d4680b0bbac/pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c", size = 39958364 }, + { url = "https://files.pythonhosted.org/packages/d1/db/42ac644453cfdfc60fe002b46d647fe7a6dfad753ef7b28e99b4c936ad5d/pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca", size = 25229211 }, + { url = "https://files.pythonhosted.org/packages/43/e0/a898096d35be240aa61fb2d54db58b86d664b10e1e51256f9300f47565e8/pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb", size = 29007881 }, + { url = "https://files.pythonhosted.org/packages/59/22/f7d14907ed0697b5dd488d393129f2738629fa5bcba863e00931b7975946/pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df", size = 27178117 }, + { url = "https://files.pythonhosted.org/packages/bf/ee/661211feac0ed48467b1d5c57298c91403809ec3ab78b1d175e1d6ad03cf/pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687", size = 39273896 }, + { url = 
"https://files.pythonhosted.org/packages/af/61/bcd9b58e38ead6ad42b9ed00da33a3f862bc1d445e3d3164799c25550ac2/pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b", size = 39875438 }, + { url = "https://files.pythonhosted.org/packages/75/63/29d1bfcc57af73cde3fc3baccab2f37548de512dbe0ab294b033cd203516/pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5", size = 38735092 }, + { url = "https://files.pythonhosted.org/packages/39/f4/90258b4de753df7cc61cefb0312f8abcf226672e96cc64996e66afce817a/pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda", size = 39867610 }, + { url = "https://files.pythonhosted.org/packages/e7/f6/b75d4816c32f1618ed31a005ee635dd1d91d8164495d94f2ea092f594661/pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204", size = 25148611 }, +] + +[[package]] +name = "pyarrow" +version = "18.1.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", + "python_full_version == '3.9.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/7f/7b/640785a9062bb00314caa8a387abce547d2a420cf09bd6c715fe659ccffb/pyarrow-18.1.0.tar.gz", hash = "sha256:9386d3ca9c145b5539a1cfc75df07757dff870168c959b473a0bccbc3abc8c73", size = 1118671 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1a/bb/8d4a1573f66e0684f190dd2b55fd0b97a7214de8882d58a3867e777bf640/pyarrow-18.1.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e21488d5cfd3d8b500b3238a6c4b075efabc18f0f6d80b29239737ebd69caa6c", size = 29531620 }, + { url = 
"https://files.pythonhosted.org/packages/30/90/893acfad917533b624a97b9e498c0e8393908508a0a72d624fe935e632bf/pyarrow-18.1.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:b516dad76f258a702f7ca0250885fc93d1fa5ac13ad51258e39d402bd9e2e1e4", size = 30836521 }, + { url = "https://files.pythonhosted.org/packages/a3/2a/526545a7464b5fb2fa6e2c4bad16ca90e59e1843025c534fd907b7f73e5a/pyarrow-18.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f443122c8e31f4c9199cb23dca29ab9427cef990f283f80fe15b8e124bcc49b", size = 39213905 }, + { url = "https://files.pythonhosted.org/packages/8a/77/4b3fab91a30e19e233e738d0c5eca5a8f6dd05758bc349a2ca262c65de79/pyarrow-18.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0a03da7f2758645d17b7b4f83c8bffeae5bbb7f974523fe901f36288d2eab71", size = 40128881 }, + { url = "https://files.pythonhosted.org/packages/aa/e2/a88e16c5e45e562449c52305bd3bc2f9d704295322d3434656e7ccac1444/pyarrow-18.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:ba17845efe3aa358ec266cf9cc2800fa73038211fb27968bfa88acd09261a470", size = 38627517 }, + { url = "https://files.pythonhosted.org/packages/6d/84/8037c20005ccc7b869726465be0957bd9c29cfc88612962030f08292ad06/pyarrow-18.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:3c35813c11a059056a22a3bef520461310f2f7eea5c8a11ef9de7062a23f8d56", size = 40060187 }, + { url = "https://files.pythonhosted.org/packages/2a/38/d6435c723ff73df8ae74626ea778262fbcc2b9b0d1a4f3db915b61711b05/pyarrow-18.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9736ba3c85129d72aefa21b4f3bd715bc4190fe4426715abfff90481e7d00812", size = 25118314 }, + { url = "https://files.pythonhosted.org/packages/9e/4d/a4988e7d82f4fbc797715db4185939a658eeffb07a25bab7262bed1ea076/pyarrow-18.1.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:eaeabf638408de2772ce3d7793b2668d4bb93807deed1725413b70e3156a7854", size = 29554860 }, + { url = 
"https://files.pythonhosted.org/packages/59/03/3a42c5c1e4bd4c900ab62aa1ff6b472bdb159ba8f1c3e5deadab7222244f/pyarrow-18.1.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:3b2e2239339c538f3464308fd345113f886ad031ef8266c6f004d49769bb074c", size = 30867076 }, + { url = "https://files.pythonhosted.org/packages/75/7e/332055ac913373e89256dce9d14b7708f55f7bd5be631456c897f0237738/pyarrow-18.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f39a2e0ed32a0970e4e46c262753417a60c43a3246972cfc2d3eb85aedd01b21", size = 39212135 }, + { url = "https://files.pythonhosted.org/packages/8c/64/5099cdb325828722ef7ffeba9a4696f238eb0cdeae227f831c2d77fcf1bd/pyarrow-18.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e31e9417ba9c42627574bdbfeada7217ad8a4cbbe45b9d6bdd4b62abbca4c6f6", size = 40125195 }, + { url = "https://files.pythonhosted.org/packages/83/88/1938d783727db1b178ff71bc6a6143d7939e406db83a9ec23cad3dad325c/pyarrow-18.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:01c034b576ce0eef554f7c3d8c341714954be9b3f5d5bc7117006b85fcf302fe", size = 38641884 }, + { url = "https://files.pythonhosted.org/packages/5e/b5/9e14e9f7590e0eaa435ecea84dabb137284a4dbba7b3c337b58b65b76d95/pyarrow-18.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:f266a2c0fc31995a06ebd30bcfdb7f615d7278035ec5b1cd71c48d56daaf30b0", size = 40076877 }, + { url = "https://files.pythonhosted.org/packages/4d/a3/817ac7fe0891a2d66e247e223080f3a6a262d8aefd77e11e8c27e6acf4e1/pyarrow-18.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:d4f13eee18433f99adefaeb7e01d83b59f73360c231d4782d9ddfaf1c3fbde0a", size = 25119811 }, + { url = "https://files.pythonhosted.org/packages/6a/50/12829e7111b932581e51dda51d5cb39207a056c30fe31ef43f14c63c4d7e/pyarrow-18.1.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:9f3a76670b263dc41d0ae877f09124ab96ce10e4e48f3e3e4257273cee61ad0d", size = 29514620 }, + { url = 
"https://files.pythonhosted.org/packages/d1/41/468c944eab157702e96abab3d07b48b8424927d4933541ab43788bb6964d/pyarrow-18.1.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:da31fbca07c435be88a0c321402c4e31a2ba61593ec7473630769de8346b54ee", size = 30856494 }, + { url = "https://files.pythonhosted.org/packages/68/f9/29fb659b390312a7345aeb858a9d9c157552a8852522f2c8bad437c29c0a/pyarrow-18.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:543ad8459bc438efc46d29a759e1079436290bd583141384c6f7a1068ed6f992", size = 39203624 }, + { url = "https://files.pythonhosted.org/packages/6e/f6/19360dae44200e35753c5c2889dc478154cd78e61b1f738514c9f131734d/pyarrow-18.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0743e503c55be0fdb5c08e7d44853da27f19dc854531c0570f9f394ec9671d54", size = 40139341 }, + { url = "https://files.pythonhosted.org/packages/bb/e6/9b3afbbcf10cc724312e824af94a2e993d8ace22994d823f5c35324cebf5/pyarrow-18.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:d4b3d2a34780645bed6414e22dda55a92e0fcd1b8a637fba86800ad737057e33", size = 38618629 }, + { url = "https://files.pythonhosted.org/packages/3a/2e/3b99f8a3d9e0ccae0e961978a0d0089b25fb46ebbcfb5ebae3cca179a5b3/pyarrow-18.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:c52f81aa6f6575058d8e2c782bf79d4f9fdc89887f16825ec3a66607a5dd8e30", size = 40078661 }, + { url = "https://files.pythonhosted.org/packages/76/52/f8da04195000099d394012b8d42c503d7041b79f778d854f410e5f05049a/pyarrow-18.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:0ad4892617e1a6c7a551cfc827e072a633eaff758fa09f21c4ee548c30bcaf99", size = 25092330 }, + { url = "https://files.pythonhosted.org/packages/cb/87/aa4d249732edef6ad88899399047d7e49311a55749d3c373007d034ee471/pyarrow-18.1.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:84e314d22231357d473eabec709d0ba285fa706a72377f9cc8e1cb3c8013813b", size = 29497406 }, + { url = 
"https://files.pythonhosted.org/packages/3c/c7/ed6adb46d93a3177540e228b5ca30d99fc8ea3b13bdb88b6f8b6467e2cb7/pyarrow-18.1.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:f591704ac05dfd0477bb8f8e0bd4b5dc52c1cadf50503858dce3a15db6e46ff2", size = 30835095 }, + { url = "https://files.pythonhosted.org/packages/41/d7/ed85001edfb96200ff606943cff71d64f91926ab42828676c0fc0db98963/pyarrow-18.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:acb7564204d3c40babf93a05624fc6a8ec1ab1def295c363afc40b0c9e66c191", size = 39194527 }, + { url = "https://files.pythonhosted.org/packages/59/16/35e28eab126342fa391593415d79477e89582de411bb95232f28b131a769/pyarrow-18.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:74de649d1d2ccb778f7c3afff6085bd5092aed4c23df9feeb45dd6b16f3811aa", size = 40131443 }, + { url = "https://files.pythonhosted.org/packages/0c/95/e855880614c8da20f4cd74fa85d7268c725cf0013dc754048593a38896a0/pyarrow-18.1.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:f96bd502cb11abb08efea6dab09c003305161cb6c9eafd432e35e76e7fa9b90c", size = 38608750 }, + { url = "https://files.pythonhosted.org/packages/54/9d/f253554b1457d4fdb3831b7bd5f8f00f1795585a606eabf6fec0a58a9c38/pyarrow-18.1.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:36ac22d7782554754a3b50201b607d553a8d71b78cdf03b33c1125be4b52397c", size = 40066690 }, + { url = "https://files.pythonhosted.org/packages/2f/58/8912a2563e6b8273e8aa7b605a345bba5a06204549826f6493065575ebc0/pyarrow-18.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:25dbacab8c5952df0ca6ca0af28f50d45bd31c1ff6fcf79e2d120b4a65ee7181", size = 25081054 }, + { url = "https://files.pythonhosted.org/packages/82/f9/d06ddc06cab1ada0c2f2fd205ac8c25c2701182de1b9c4bf7a0a44844431/pyarrow-18.1.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:6a276190309aba7bc9d5bd2933230458b3521a4317acfefe69a354f2fe59f2bc", size = 29525542 }, + { url = 
"https://files.pythonhosted.org/packages/ab/94/8917e3b961810587ecbdaa417f8ebac0abb25105ae667b7aa11c05876976/pyarrow-18.1.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:ad514dbfcffe30124ce655d72771ae070f30bf850b48bc4d9d3b25993ee0e386", size = 30829412 }, + { url = "https://files.pythonhosted.org/packages/5e/e3/3b16c3190f3d71d3b10f6758d2d5f7779ef008c4fd367cedab3ed178a9f7/pyarrow-18.1.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aebc13a11ed3032d8dd6e7171eb6e86d40d67a5639d96c35142bd568b9299324", size = 39119106 }, + { url = "https://files.pythonhosted.org/packages/1d/d6/5d704b0d25c3c79532f8c0639f253ec2803b897100f64bcb3f53ced236e5/pyarrow-18.1.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6cf5c05f3cee251d80e98726b5c7cc9f21bab9e9783673bac58e6dfab57ecc8", size = 40090940 }, + { url = "https://files.pythonhosted.org/packages/37/29/366bc7e588220d74ec00e497ac6710c2833c9176f0372fe0286929b2d64c/pyarrow-18.1.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:11b676cd410cf162d3f6a70b43fb9e1e40affbc542a1e9ed3681895f2962d3d9", size = 38548177 }, + { url = "https://files.pythonhosted.org/packages/c8/11/fabf6ecabb1fe5b7d96889228ca2a9158c4c3bb732e3b8ee3f7f6d40b703/pyarrow-18.1.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:b76130d835261b38f14fc41fdfb39ad8d672afb84c447126b84d5472244cfaba", size = 40043567 }, + { url = "https://files.pythonhosted.org/packages/fd/9b/60516e3876ec6f25b0909afa70f90a15de83b48c7c0d8042fac4e64c4411/pyarrow-18.1.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:0b331e477e40f07238adc7ba7469c36b908f07c89b95dd4bd3a0ec84a3d1e21e", size = 29543752 }, + { url = "https://files.pythonhosted.org/packages/14/a7/bd08b6f1a2bd2e71dc6bb0451fc1872607e44c83daf1ee63c82764a2d233/pyarrow-18.1.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:2c4dd0c9010a25ba03e198fe743b1cc03cd33c08190afff371749c52ccbbaf76", size = 30850753 }, + { url = 
"https://files.pythonhosted.org/packages/84/c9/62ef9c6281c0e5b4ee1afa9d7bd556e72e06da6706b7906c32c15e69b3d6/pyarrow-18.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f97b31b4c4e21ff58c6f330235ff893cc81e23da081b1a4b1c982075e0ed4e9", size = 39226870 }, + { url = "https://files.pythonhosted.org/packages/b2/99/a6e89e71655a38475e76b060777c8bf69c078b772bec3b7daf7361440f05/pyarrow-18.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a4813cb8ecf1809871fd2d64a8eff740a1bd3691bbe55f01a3cf6c5ec869754", size = 40139114 }, + { url = "https://files.pythonhosted.org/packages/64/a9/06d79923890682e4fe7a16524abee307407008a413115354aaf3226b8410/pyarrow-18.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:05a5636ec3eb5cc2a36c6edb534a38ef57b2ab127292a716d00eabb887835f1e", size = 38639231 }, + { url = "https://files.pythonhosted.org/packages/3b/8c/4c3ed19026a00740b81fe1c87f3ff235b2763a0a1ddf5711a9d026b775ce/pyarrow-18.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:73eeed32e724ea3568bb06161cad5fa7751e45bc2228e33dcb10c614044165c7", size = 40070949 }, + { url = "https://files.pythonhosted.org/packages/87/d8/94161a7ca5c55199484e926165e9e33f318ea1d1b0d7cdbcbc3652b933ec/pyarrow-18.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:a1880dd6772b685e803011a6b43a230c23b566859a6e0c9a276c1e0faf4f4052", size = 25301373 }, +] + +[[package]] +name = "pydata-sphinx-theme" +version = "0.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "beautifulsoup4" }, + { name = "docutils", version = "0.20.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "sphinx", version = "7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "sphinx", version = "7.4.7", 
source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fc/d6/3921de802cf1ee771f0e76c9068b52498aeb8eeec6b830ff931c81c7ecf3/pydata_sphinx_theme-0.8.0.tar.gz", hash = "sha256:9f72015d9c572ea92e3007ab221a8325767c426783b6b9941813e65fa988dc90", size = 1123746 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/91/26/0694318d46c7d90ab602ae27b24431e939f1600f9a4c69d1e727ec57289f/pydata_sphinx_theme-0.8.0-py3-none-any.whl", hash = "sha256:fbcbb833a07d3ad8dd997dd40dc94da18d98b41c68123ab0182b58fe92271204", size = 3284997 }, +] + +[[package]] +name = "pygments" +version = "2.19.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7c/2d/c3338d48ea6cc0feb8446d8e6937e1408088a72a39937982cc6111d17f84/pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f", size = 4968581 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 }, +] + +[[package]] +name = "pytest" +version = "8.3.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/05/35/30e0d83068951d90a01852cb1cef56e5d8a09d20c7f511634cc2f7e0372a/pytest-8.3.4.tar.gz", hash = 
"sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761", size = 1445919 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/92/76a1c94d3afee238333bc0a42b82935dd8f9cf8ce9e336ff87ee14d9e1cf/pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6", size = 343083 }, +] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892 }, +] + +[[package]] +name = "pytz" +version = "2024.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3a/31/3c70bf7603cc2dca0f19bdc53b4537a797747a58875b552c8c413d963a3f/pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a", size = 319692 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/c3/005fcca25ce078d2cc29fd559379817424e94885510568bc1bc53d7d5846/pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725", size = 508002 }, +] + +[[package]] +name = "pyyaml" +version = "6.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", 
size = 130631 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/95/a3fac87cb7158e231b5a6012e438c647e1a87f09f8e0d123acec8ab8bf71/PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086", size = 184199 }, + { url = "https://files.pythonhosted.org/packages/c7/7a/68bd47624dab8fd4afbfd3c48e3b79efe09098ae941de5b58abcbadff5cb/PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf", size = 171758 }, + { url = "https://files.pythonhosted.org/packages/49/ee/14c54df452143b9ee9f0f29074d7ca5516a36edb0b4cc40c3f280131656f/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237", size = 718463 }, + { url = "https://files.pythonhosted.org/packages/4d/61/de363a97476e766574650d742205be468921a7b532aa2499fcd886b62530/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b", size = 719280 }, + { url = "https://files.pythonhosted.org/packages/6b/4e/1523cb902fd98355e2e9ea5e5eb237cbc5f3ad5f3075fa65087aa0ecb669/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed", size = 751239 }, + { url = "https://files.pythonhosted.org/packages/b7/33/5504b3a9a4464893c32f118a9cc045190a91637b119a9c881da1cf6b7a72/PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180", size = 695802 }, + { url = "https://files.pythonhosted.org/packages/5c/20/8347dcabd41ef3a3cdc4f7b7a2aff3d06598c8779faa189cdbf878b626a4/PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68", size = 720527 }, + { url = 
"https://files.pythonhosted.org/packages/be/aa/5afe99233fb360d0ff37377145a949ae258aaab831bde4792b32650a4378/PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99", size = 144052 }, + { url = "https://files.pythonhosted.org/packages/b5/84/0fa4b06f6d6c958d207620fc60005e241ecedceee58931bb20138e1e5776/PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e", size = 161774 }, + { url = "https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774", size = 184612 }, + { url = "https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee", size = 172040 }, + { url = "https://files.pythonhosted.org/packages/ad/0c/c804f5f922a9a6563bab712d8dcc70251e8af811fce4524d57c2c0fd49a4/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c", size = 736829 }, + { url = "https://files.pythonhosted.org/packages/51/16/6af8d6a6b210c8e54f1406a6b9481febf9c64a3109c541567e35a49aa2e7/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317", size = 764167 }, + { url = "https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85", size = 762952 }, + { url = 
"https://files.pythonhosted.org/packages/9b/97/ecc1abf4a823f5ac61941a9c00fe501b02ac3ab0e373c3857f7d4b83e2b6/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4", size = 735301 }, + { url = "https://files.pythonhosted.org/packages/45/73/0f49dacd6e82c9430e46f4a027baa4ca205e8b0a9dce1397f44edc23559d/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e", size = 756638 }, + { url = "https://files.pythonhosted.org/packages/22/5f/956f0f9fc65223a58fbc14459bf34b4cc48dec52e00535c79b8db361aabd/PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5", size = 143850 }, + { url = "https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44", size = 161980 }, + { url = "https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab", size = 183873 }, + { url = "https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725", size = 173302 }, + { url = "https://files.pythonhosted.org/packages/c3/93/9916574aa8c00aa06bbac729972eb1071d002b8e158bd0e83a3b9a20a1f7/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5", size = 739154 }, + { url = 
"https://files.pythonhosted.org/packages/95/0f/b8938f1cbd09739c6da569d172531567dbcc9789e0029aa070856f123984/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425", size = 766223 }, + { url = "https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476", size = 767542 }, + { url = "https://files.pythonhosted.org/packages/d4/00/dd137d5bcc7efea1836d6264f049359861cf548469d18da90cd8216cf05f/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48", size = 731164 }, + { url = "https://files.pythonhosted.org/packages/c9/1f/4f998c900485e5c0ef43838363ba4a9723ac0ad73a9dc42068b12aaba4e4/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b", size = 756611 }, + { url = "https://files.pythonhosted.org/packages/df/d1/f5a275fdb252768b7a11ec63585bc38d0e87c9e05668a139fea92b80634c/PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4", size = 140591 }, + { url = "https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8", size = 156338 }, + { url = "https://files.pythonhosted.org/packages/ef/e3/3af305b830494fa85d95f6d95ef7fa73f2ee1cc8ef5b495c7c3269fb835f/PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba", size = 181309 }, + { url = 
"https://files.pythonhosted.org/packages/45/9f/3b1c20a0b7a3200524eb0076cc027a970d320bd3a6592873c85c92a08731/PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1", size = 171679 }, + { url = "https://files.pythonhosted.org/packages/7c/9a/337322f27005c33bcb656c655fa78325b730324c78620e8328ae28b64d0c/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133", size = 733428 }, + { url = "https://files.pythonhosted.org/packages/a3/69/864fbe19e6c18ea3cc196cbe5d392175b4cf3d5d0ac1403ec3f2d237ebb5/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484", size = 763361 }, + { url = "https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5", size = 759523 }, + { url = "https://files.pythonhosted.org/packages/2b/b2/e3234f59ba06559c6ff63c4e10baea10e5e7df868092bf9ab40e5b9c56b6/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc", size = 726660 }, + { url = "https://files.pythonhosted.org/packages/fe/0f/25911a9f080464c59fab9027482f822b86bf0608957a5fcc6eaac85aa515/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", size = 751597 }, + { url = "https://files.pythonhosted.org/packages/14/0d/e2c3b43bbce3cf6bd97c840b46088a3031085179e596d4929729d8d68270/PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", size = 140527 }, + { url = 
"https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446 }, + { url = "https://files.pythonhosted.org/packages/74/d9/323a59d506f12f498c2097488d80d16f4cf965cee1791eab58b56b19f47a/PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a", size = 183218 }, + { url = "https://files.pythonhosted.org/packages/74/cc/20c34d00f04d785f2028737e2e2a8254e1425102e730fee1d6396f832577/PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5", size = 728067 }, + { url = "https://files.pythonhosted.org/packages/20/52/551c69ca1501d21c0de51ddafa8c23a0191ef296ff098e98358f69080577/PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d", size = 757812 }, + { url = "https://files.pythonhosted.org/packages/fd/7f/2c3697bba5d4aa5cc2afe81826d73dfae5f049458e44732c7a0938baa673/PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083", size = 746531 }, + { url = "https://files.pythonhosted.org/packages/8c/ab/6226d3df99900e580091bb44258fde77a8433511a86883bd4681ea19a858/PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706", size = 800820 }, + { url = "https://files.pythonhosted.org/packages/a0/99/a9eb0f3e710c06c5d922026f6736e920d431812ace24aae38228d0d64b04/PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a", size = 145514 }, + { url = 
"https://files.pythonhosted.org/packages/75/8a/ee831ad5fafa4431099aa4e078d4c8efd43cd5e48fbc774641d233b683a9/PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff", size = 162702 }, + { url = "https://files.pythonhosted.org/packages/65/d8/b7a1db13636d7fb7d4ff431593c510c8b8fca920ade06ca8ef20015493c5/PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d", size = 184777 }, + { url = "https://files.pythonhosted.org/packages/0a/02/6ec546cd45143fdf9840b2c6be8d875116a64076218b61d68e12548e5839/PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f", size = 172318 }, + { url = "https://files.pythonhosted.org/packages/0e/9a/8cc68be846c972bda34f6c2a93abb644fb2476f4dcc924d52175786932c9/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290", size = 720891 }, + { url = "https://files.pythonhosted.org/packages/e9/6c/6e1b7f40181bc4805e2e07f4abc10a88ce4648e7e95ff1abe4ae4014a9b2/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12", size = 722614 }, + { url = "https://files.pythonhosted.org/packages/3d/32/e7bd8535d22ea2874cef6a81021ba019474ace0d13a4819c2a4bce79bd6a/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19", size = 737360 }, + { url = "https://files.pythonhosted.org/packages/d7/12/7322c1e30b9be969670b672573d45479edef72c9a0deac3bb2868f5d7469/PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e", size = 699006 }, + { url = 
"https://files.pythonhosted.org/packages/82/72/04fcad41ca56491995076630c3ec1e834be241664c0c09a64c9a2589b507/PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725", size = 723577 }, + { url = "https://files.pythonhosted.org/packages/ed/5e/46168b1f2757f1fcd442bc3029cd8767d88a98c9c05770d8b420948743bb/PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631", size = 144593 }, + { url = "https://files.pythonhosted.org/packages/19/87/5124b1c1f2412bb95c59ec481eaf936cd32f0fe2a7b16b97b81c4c017a6a/PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8", size = 162312 }, +] + +[[package]] +name = "requests" +version = "2.32.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "urllib3", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 }, +] + +[[package]] +name = "ruff" +version = "0.9.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/67/3e/e89f736f01aa9517a97e2e7e0ce8d34a4d8207087b3cfdec95133fee13b5/ruff-0.9.1.tar.gz", hash = "sha256:fd2b25ecaf907d6458fa842675382c8597b3c746a2dde6717fe3415425df0c17", size = 3498844 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/05/c3a2e0feb3d5d394cdfd552de01df9d3ec8a3a3771bbff247fab7e668653/ruff-0.9.1-py3-none-linux_armv6l.whl", hash = "sha256:84330dda7abcc270e6055551aca93fdde1b0685fc4fd358f26410f9349cf1743", size = 10645241 }, + { url = "https://files.pythonhosted.org/packages/dd/da/59f0a40e5f88ee5c054ad175caaa2319fc96571e1d29ab4730728f2aad4f/ruff-0.9.1-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:3cae39ba5d137054b0e5b472aee3b78a7c884e61591b100aeb544bcd1fc38d4f", size = 10391066 }, + { url = "https://files.pythonhosted.org/packages/b7/fe/85e1c1acf0ba04a3f2d54ae61073da030f7a5dc386194f96f3c6ca444a78/ruff-0.9.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:50c647ff96f4ba288db0ad87048257753733763b409b2faf2ea78b45c8bb7fcb", size = 10012308 }, + { url = "https://files.pythonhosted.org/packages/6f/9b/780aa5d4bdca8dcea4309264b8faa304bac30e1ce0bcc910422bfcadd203/ruff-0.9.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0c8b149e9c7353cace7d698e1656ffcf1e36e50f8ea3b5d5f7f87ff9986a7ca", size = 10881960 }, + { url = "https://files.pythonhosted.org/packages/12/f4/dac4361afbfe520afa7186439e8094e4884ae3b15c8fc75fb2e759c1f267/ruff-0.9.1-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:beb3298604540c884d8b282fe7625651378e1986c25df51dec5b2f60cafc31ce", size = 10414803 }, + { url = "https://files.pythonhosted.org/packages/f0/a2/057a3cb7999513cb78d6cb33a7d1cc6401c82d7332583786e4dad9e38e44/ruff-0.9.1-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:39d0174ccc45c439093971cc06ed3ac4dc545f5e8bdacf9f067adf879544d969", size = 11464929 }, + { url = 
"https://files.pythonhosted.org/packages/eb/c6/1ccfcc209bee465ced4874dcfeaadc88aafcc1ea9c9f31ef66f063c187f0/ruff-0.9.1-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:69572926c0f0c9912288915214ca9b2809525ea263603370b9e00bed2ba56dbd", size = 12170717 }, + { url = "https://files.pythonhosted.org/packages/84/97/4a524027518525c7cf6931e9fd3b2382be5e4b75b2b61bec02681a7685a5/ruff-0.9.1-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:937267afce0c9170d6d29f01fcd1f4378172dec6760a9f4dface48cdabf9610a", size = 11708921 }, + { url = "https://files.pythonhosted.org/packages/a6/a4/4e77cf6065c700d5593b25fca6cf725b1ab6d70674904f876254d0112ed0/ruff-0.9.1-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:186c2313de946f2c22bdf5954b8dd083e124bcfb685732cfb0beae0c47233d9b", size = 13058074 }, + { url = "https://files.pythonhosted.org/packages/f9/d6/fcb78e0531e863d0a952c4c5600cc5cd317437f0e5f031cd2288b117bb37/ruff-0.9.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f94942a3bb767675d9a051867c036655fe9f6c8a491539156a6f7e6b5f31831", size = 11281093 }, + { url = "https://files.pythonhosted.org/packages/e4/3b/7235bbeff00c95dc2d073cfdbf2b871b5bbf476754c5d277815d286b4328/ruff-0.9.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:728d791b769cc28c05f12c280f99e8896932e9833fef1dd8756a6af2261fd1ab", size = 10882610 }, + { url = "https://files.pythonhosted.org/packages/2a/66/5599d23257c61cf038137f82999ca8f9d0080d9d5134440a461bef85b461/ruff-0.9.1-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:2f312c86fb40c5c02b44a29a750ee3b21002bd813b5233facdaf63a51d9a85e1", size = 10489273 }, + { url = "https://files.pythonhosted.org/packages/78/85/de4aa057e2532db0f9761e2c2c13834991e087787b93e4aeb5f1cb10d2df/ruff-0.9.1-py3-none-musllinux_1_2_i686.whl", hash = "sha256:ae017c3a29bee341ba584f3823f805abbe5fe9cd97f87ed07ecbf533c4c88366", size = 11003314 }, + { url = 
"https://files.pythonhosted.org/packages/00/42/afedcaa089116d81447347f76041ff46025849fedb0ed2b187d24cf70fca/ruff-0.9.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5dc40a378a0e21b4cfe2b8a0f1812a6572fc7b230ef12cd9fac9161aa91d807f", size = 11342982 }, + { url = "https://files.pythonhosted.org/packages/39/c6/fe45f3eb27e3948b41a305d8b768e949bf6a39310e9df73f6c576d7f1d9f/ruff-0.9.1-py3-none-win32.whl", hash = "sha256:46ebf5cc106cf7e7378ca3c28ce4293b61b449cd121b98699be727d40b79ba72", size = 8819750 }, + { url = "https://files.pythonhosted.org/packages/38/8d/580db77c3b9d5c3d9479e55b0b832d279c30c8f00ab0190d4cd8fc67831c/ruff-0.9.1-py3-none-win_amd64.whl", hash = "sha256:342a824b46ddbcdddd3abfbb332fa7fcaac5488bf18073e841236aadf4ad5c19", size = 9701331 }, + { url = "https://files.pythonhosted.org/packages/b2/94/0498cdb7316ed67a1928300dd87d659c933479f44dec51b4f62bfd1f8028/ruff-0.9.1-py3-none-win_arm64.whl", hash = "sha256:1cd76c7f9c679e6e8f2af8f778367dca82b95009bc7b1a85a47f1521ae524fa7", size = 9145708 }, +] + +[[package]] +name = "setuptools" +version = "75.3.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +sdist = { url = "https://files.pythonhosted.org/packages/ed/22/a438e0caa4576f8c383fa4d35f1cc01655a46c75be358960d815bfbb12bd/setuptools-75.3.0.tar.gz", hash = "sha256:fba5dd4d766e97be1b1681d98712680ae8f2f26d7881245f2ce9e40714f1a686", size = 1351577 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/12/282ee9bce8b58130cb762fbc9beabd531549952cac11fc56add11dcb7ea0/setuptools-75.3.0-py3-none-any.whl", hash = "sha256:f2504966861356aa38616760c0f66568e535562374995367b4e69c7143cf6bcd", size = 1251070 }, +] + +[[package]] +name = "setuptools" +version = "75.8.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", + "python_full_version == '3.9.*'", +] +sdist = { url 
= "https://files.pythonhosted.org/packages/92/ec/089608b791d210aec4e7f97488e67ab0d33add3efccb83a056cbafe3a2a6/setuptools-75.8.0.tar.gz", hash = "sha256:c5afc8f407c626b8313a86e10311dd3f661c6cd9c09d4bf8c15c0e11f9f2b0e6", size = 1343222 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/8a/b9dc7678803429e4a3bc9ba462fa3dd9066824d3c607490235c6a796be5a/setuptools-75.8.0-py3-none-any.whl", hash = "sha256:e3982f444617239225d675215d51f6ba05f845d4eec313da4418fdbb56fb27e3", size = 1228782 }, +] + +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050 }, +] + +[[package]] +name = "snowballstemmer" +version = "2.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/44/7b/af302bebf22c749c56c9c3e8ae13190b5b5db37a33d9068652e8f73b7089/snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", size = 86699 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ed/dc/c02e01294f7265e63a7315fe086dd1df7dacb9f840a804da846b96d01b96/snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a", size = 93002 }, +] + +[[package]] +name = "soupsieve" +version = "2.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/d7/ce/fbaeed4f9fb8b2daa961f90591662df6a86c1abf25c548329a86920aedfb/soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb", size = 101569 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/c2/fe97d779f3ef3b15f05c94a2f1e3d21732574ed441687474db9d342a7315/soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9", size = 36186 }, +] + +[[package]] +name = "sphinx" +version = "7.1.2" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +dependencies = [ + { name = "alabaster", version = "0.7.13", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "babel", marker = "python_full_version < '3.9'" }, + { name = "colorama", marker = "python_full_version < '3.9' and sys_platform == 'win32'" }, + { name = "docutils", version = "0.20.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "imagesize", marker = "python_full_version < '3.9'" }, + { name = "importlib-metadata", marker = "python_full_version < '3.9'" }, + { name = "jinja2", marker = "python_full_version < '3.9'" }, + { name = "packaging", marker = "python_full_version < '3.9'" }, + { name = "pygments", marker = "python_full_version < '3.9'" }, + { name = "requests", marker = "python_full_version < '3.9'" }, + { name = "snowballstemmer", marker = "python_full_version < '3.9'" }, + { name = "sphinxcontrib-applehelp", version = "1.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "sphinxcontrib-devhelp", version = "1.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "sphinxcontrib-htmlhelp", version = "2.0.1", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version < '3.9'" }, + { name = "sphinxcontrib-jsmath", marker = "python_full_version < '3.9'" }, + { name = "sphinxcontrib-qthelp", version = "1.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "sphinxcontrib-serializinghtml", version = "1.1.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dc/01/688bdf9282241dca09fe6e3a1110eda399fa9b10d0672db609e37c2e7a39/sphinx-7.1.2.tar.gz", hash = "sha256:780f4d32f1d7d1126576e0e5ecc19dc32ab76cd24e950228dcf7b1f6d3d9e22f", size = 6828258 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/17/325cf6a257d84751a48ae90752b3d8fe0be8f9535b6253add61c49d0d9bc/sphinx-7.1.2-py3-none-any.whl", hash = "sha256:d170a81825b2fcacb6dfd5a0d7f578a053e45d3f2b153fecc948c37344eb4cbe", size = 3169543 }, +] + +[[package]] +name = "sphinx" +version = "7.4.7" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version == '3.9.*'", +] +dependencies = [ + { name = "alabaster", version = "0.7.16", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "babel", marker = "python_full_version == '3.9.*'" }, + { name = "colorama", marker = "python_full_version == '3.9.*' and sys_platform == 'win32'" }, + { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "imagesize", marker = "python_full_version == '3.9.*'" }, + { name = "importlib-metadata", marker = "python_full_version == '3.9.*'" }, + { name = "jinja2", marker = "python_full_version == '3.9.*'" }, + { name = "packaging", marker = "python_full_version == '3.9.*'" }, + { name = "pygments", marker = "python_full_version == '3.9.*'" }, + { name = "requests", marker = "python_full_version == '3.9.*'" }, + { name = "snowballstemmer", 
marker = "python_full_version == '3.9.*'" }, + { name = "sphinxcontrib-applehelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "sphinxcontrib-devhelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "sphinxcontrib-htmlhelp", version = "2.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "sphinxcontrib-jsmath", marker = "python_full_version == '3.9.*'" }, + { name = "sphinxcontrib-qthelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "sphinxcontrib-serializinghtml", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "tomli", marker = "python_full_version == '3.9.*'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/be/50e50cb4f2eff47df05673d361095cafd95521d2a22521b920c67a372dcb/sphinx-7.4.7.tar.gz", hash = "sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe", size = 8067911 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0d/ef/153f6803c5d5f8917dbb7f7fcf6d34a871ede3296fa89c2c703f5f8a6c8e/sphinx-7.4.7-py3-none-any.whl", hash = "sha256:c2419e2135d11f1951cd994d6eb18a1835bd8fdd8429f9ca375dc1f3281bd239", size = 3401624 }, +] + +[[package]] +name = "sphinx" +version = "8.1.3" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", +] +dependencies = [ + { name = "alabaster", version = "1.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "babel", marker = "python_full_version >= '3.10'" }, + { name = "colorama", marker = "python_full_version >= '3.10' and 
sys_platform == 'win32'" }, + { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "imagesize", marker = "python_full_version >= '3.10'" }, + { name = "jinja2", marker = "python_full_version >= '3.10'" }, + { name = "packaging", marker = "python_full_version >= '3.10'" }, + { name = "pygments", marker = "python_full_version >= '3.10'" }, + { name = "requests", marker = "python_full_version >= '3.10'" }, + { name = "snowballstemmer", marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-applehelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-devhelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-htmlhelp", version = "2.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-jsmath", marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-qthelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-serializinghtml", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "tomli", marker = "python_full_version == '3.10.*'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/be0b61178fe2cdcb67e2a92fc9ebb488e3c51c4f74a36a7824c0adf23425/sphinx-8.1.3.tar.gz", hash = "sha256:43c1911eecb0d3e161ad78611bc905d1ad0e523e4ddc202a58a821773dc4c927", size = 8184611 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/60/1ddff83a56d33aaf6f10ec8ce84b4c007d9368b21008876fceda7e7381ef/sphinx-8.1.3-py3-none-any.whl", hash = "sha256:09719015511837b76bf6e03e42eb7595ac8c2e41eeb9c29c5b755c6b677992a2", size = 3487125 }, +] + 
+[[package]] +name = "sphinx-autoapi" +version = "3.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "astroid", version = "3.2.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "astroid", version = "3.3.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "jinja2" }, + { name = "pyyaml" }, + { name = "sphinx", version = "7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "stdlib-list", version = "0.10.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "stdlib-list", version = "0.11.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4a/eb/cc243583bb1d518ca3b10998c203d919a8ed90affd4831f2b61ad09043d2/sphinx_autoapi-3.4.0.tar.gz", hash = "sha256:e6d5371f9411bbb9fca358c00a9e57aef3ac94cbfc5df4bab285946462f69e0c", size = 29292 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/d6/f2acdc2567337fd5f5dc091a4e58d8a0fb14927b9779fc1e5ecee96d9824/sphinx_autoapi-3.4.0-py3-none-any.whl", hash = "sha256:4027fef2875a22c5f2a57107c71641d82f6166bf55beb407a47aaf3ef14e7b92", size = 34095 }, +] + +[[package]] +name = "sphinxcontrib-applehelp" +version = "1.0.4" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +sdist = { url = "https://files.pythonhosted.org/packages/32/df/45e827f4d7e7fcc84e853bcef1d836effd762d63ccb86f43ede4e98b478c/sphinxcontrib-applehelp-1.0.4.tar.gz", hash 
= "sha256:828f867945bbe39817c210a1abfd1bc4895c8b73fcaade56d45357a348a07d7e", size = 24766 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/06/c1/5e2cafbd03105ce50d8500f9b4e8a6e8d02e22d0475b574c3b3e9451a15f/sphinxcontrib_applehelp-1.0.4-py3-none-any.whl", hash = "sha256:29d341f67fb0f6f586b23ad80e072c8e6ad0b48417db2bde114a4c9746feb228", size = 120601 }, +] + +[[package]] +name = "sphinxcontrib-applehelp" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", + "python_full_version == '3.9.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/ba/6e/b837e84a1a704953c62ef8776d45c3e8d759876b4a84fe14eba2859106fe/sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1", size = 20053 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5d/85/9ebeae2f76e9e77b952f4b274c27238156eae7979c5421fba91a28f4970d/sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5", size = 119300 }, +] + +[[package]] +name = "sphinxcontrib-devhelp" +version = "1.0.2" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +sdist = { url = "https://files.pythonhosted.org/packages/98/33/dc28393f16385f722c893cb55539c641c9aaec8d1bc1c15b69ce0ac2dbb3/sphinxcontrib-devhelp-1.0.2.tar.gz", hash = "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4", size = 17398 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c5/09/5de5ed43a521387f18bdf5f5af31d099605c992fd25372b2b9b825ce48ee/sphinxcontrib_devhelp-1.0.2-py2.py3-none-any.whl", hash = "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e", size = 84690 }, +] + +[[package]] +name = "sphinxcontrib-devhelp" +version = "2.0.0" +source = 
{ registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", + "python_full_version == '3.9.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/f6/d2/5beee64d3e4e747f316bae86b55943f51e82bb86ecd325883ef65741e7da/sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad", size = 12967 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2", size = 82530 }, +] + +[[package]] +name = "sphinxcontrib-htmlhelp" +version = "2.0.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/47/64cff68ea3aa450c373301e5bebfbb9fce0a3e70aca245fcadd4af06cd75/sphinxcontrib-htmlhelp-2.0.1.tar.gz", hash = "sha256:0cbdd302815330058422b98a113195c9249825d681e18f11e8b1f78a2f11efff", size = 27967 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6e/ee/a1f5e39046cbb5f8bc8fba87d1ddf1c6643fbc9194e58d26e606de4b9074/sphinxcontrib_htmlhelp-2.0.1-py3-none-any.whl", hash = "sha256:c38cb46dccf316c79de6e5515e1770414b797162b23cd3d06e67020e1d2a6903", size = 99833 }, +] + +[[package]] +name = "sphinxcontrib-htmlhelp" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", + "python_full_version == '3.9.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/43/93/983afd9aa001e5201eab16b5a444ed5b9b0a7a010541e0ddfbbfd0b2470c/sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = 
"sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9", size = 22617 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8", size = 98705 }, +] + +[[package]] +name = "sphinxcontrib-jsmath" +version = "1.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/e8/9ed3830aeed71f17c026a07a5097edcf44b692850ef215b161b8ad875729/sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8", size = 5787 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c2/42/4c8646762ee83602e3fb3fbe774c2fac12f317deb0b5dbeeedd2d3ba4b77/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", size = 5071 }, +] + +[[package]] +name = "sphinxcontrib-qthelp" +version = "1.0.3" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/8e/c4846e59f38a5f2b4a0e3b27af38f2fcf904d4bfd82095bf92de0b114ebd/sphinxcontrib-qthelp-1.0.3.tar.gz", hash = "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72", size = 21658 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2b/14/05f9206cf4e9cfca1afb5fd224c7cd434dcc3a433d6d9e4e0264d29c6cdb/sphinxcontrib_qthelp-1.0.3-py2.py3-none-any.whl", hash = "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6", size = 90609 }, +] + +[[package]] +name = "sphinxcontrib-qthelp" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", + 
"python_full_version == '3.9.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/68/bc/9104308fc285eb3e0b31b67688235db556cd5b0ef31d96f30e45f2e51cae/sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab", size = 17165 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/83/859ecdd180cacc13b1f7e857abf8582a64552ea7a061057a6c716e790fce/sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb", size = 88743 }, +] + +[[package]] +name = "sphinxcontrib-serializinghtml" +version = "1.1.5" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +sdist = { url = "https://files.pythonhosted.org/packages/b5/72/835d6fadb9e5d02304cf39b18f93d227cd93abd3c41ebf58e6853eeb1455/sphinxcontrib-serializinghtml-1.1.5.tar.gz", hash = "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952", size = 21019 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/77/5464ec50dd0f1c1037e3c93249b040c8fc8078fdda97530eeb02424b6eea/sphinxcontrib_serializinghtml-1.1.5-py2.py3-none-any.whl", hash = "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd", size = 94021 }, +] + +[[package]] +name = "sphinxcontrib-serializinghtml" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", + "python_full_version == '3.9.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/3b/44/6716b257b0aa6bfd51a1b31665d1c205fb12cb5ad56de752dfa15657de2f/sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d", size = 16080 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072 }, +] + +[[package]] +name = "stack-data" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "asttokens" }, + { name = "executing" }, + { name = "pure-eval" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521 }, +] + +[[package]] +name = "stdlib-list" +version = "0.10.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +sdist = { url = "https://files.pythonhosted.org/packages/39/bb/1cdbc326a5ab0026602e0489cbf02357e78140253c4b57cd866d380eb355/stdlib_list-0.10.0.tar.gz", hash = "sha256:6519c50d645513ed287657bfe856d527f277331540691ddeaf77b25459964a14", size = 59447 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/13/d9/9085375f0d23a4896b307bf14dcc61b49ec8cc67cb33e06cf95bf3af3966/stdlib_list-0.10.0-py3-none-any.whl", hash = "sha256:b3a911bc441d03e0332dd1a9e7d0870ba3bb0a542a74d7524f54fb431256e214", size = 79814 }, +] + +[[package]] +name = "stdlib-list" +version = "0.11.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version == '3.9.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/5d/04/6b37a71e92ddca16b190b7df62494ac4779d58ced4787f73584eb32c8f03/stdlib_list-0.11.0.tar.gz", hash 
= "sha256:b74a7b643a77a12637e907f3f62f0ab9f67300bce4014f6b2d3c8b4c8fd63c66", size = 60335 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/fe/e07300c027a868d32d8ed7a425503401e91a03ff90e7ca525c115c634ffb/stdlib_list-0.11.0-py3-none-any.whl", hash = "sha256:8bf8decfffaaf273d4cfeb5bd852b910a00dec1037dcf163576803622bccf597", size = 83617 }, +] + +[[package]] +name = "toml" +version = "0.10.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/be/ba/1f744cdc819428fc6b5084ec34d9b30660f6f9daaf70eead706e3203ec3c/toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f", size = 22253 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/44/6f/7120676b6d73228c96e17f1f794d8ab046fc910d781c8d151120c3f1569e/toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", size = 16588 }, +] + +[[package]] +name = "tomli" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/18/87/302344fed471e44a87289cf4967697d07e532f2421fdaf868a303cbae4ff/tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff", size = 17175 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/ca/75707e6efa2b37c77dadb324ae7d9571cb424e61ea73fad7c56c2d14527f/tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249", size = 131077 }, + { url = "https://files.pythonhosted.org/packages/c7/16/51ae563a8615d472fdbffc43a3f3d46588c264ac4f024f63f01283becfbb/tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6", size = 123429 }, + { url = 
"https://files.pythonhosted.org/packages/f1/dd/4f6cd1e7b160041db83c694abc78e100473c15d54620083dbd5aae7b990e/tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a", size = 226067 }, + { url = "https://files.pythonhosted.org/packages/a9/6b/c54ede5dc70d648cc6361eaf429304b02f2871a345bbdd51e993d6cdf550/tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee", size = 236030 }, + { url = "https://files.pythonhosted.org/packages/1f/47/999514fa49cfaf7a92c805a86c3c43f4215621855d151b61c602abb38091/tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e", size = 240898 }, + { url = "https://files.pythonhosted.org/packages/73/41/0a01279a7ae09ee1573b423318e7934674ce06eb33f50936655071d81a24/tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4", size = 229894 }, + { url = "https://files.pythonhosted.org/packages/55/18/5d8bc5b0a0362311ce4d18830a5d28943667599a60d20118074ea1b01bb7/tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106", size = 245319 }, + { url = "https://files.pythonhosted.org/packages/92/a3/7ade0576d17f3cdf5ff44d61390d4b3febb8a9fc2b480c75c47ea048c646/tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8", size = 238273 }, + { url = "https://files.pythonhosted.org/packages/72/6f/fa64ef058ac1446a1e51110c375339b3ec6be245af9d14c87c4a6412dd32/tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff", size = 98310 }, + { url = 
"https://files.pythonhosted.org/packages/6a/1c/4a2dcde4a51b81be3530565e92eda625d94dafb46dbeb15069df4caffc34/tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b", size = 108309 }, + { url = "https://files.pythonhosted.org/packages/52/e1/f8af4c2fcde17500422858155aeb0d7e93477a0d59a98e56cbfe75070fd0/tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea", size = 132762 }, + { url = "https://files.pythonhosted.org/packages/03/b8/152c68bb84fc00396b83e7bbddd5ec0bd3dd409db4195e2a9b3e398ad2e3/tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8", size = 123453 }, + { url = "https://files.pythonhosted.org/packages/c8/d6/fc9267af9166f79ac528ff7e8c55c8181ded34eb4b0e93daa767b8841573/tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192", size = 233486 }, + { url = "https://files.pythonhosted.org/packages/5c/51/51c3f2884d7bab89af25f678447ea7d297b53b5a3b5730a7cb2ef6069f07/tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222", size = 242349 }, + { url = "https://files.pythonhosted.org/packages/ab/df/bfa89627d13a5cc22402e441e8a931ef2108403db390ff3345c05253935e/tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77", size = 252159 }, + { url = "https://files.pythonhosted.org/packages/9e/6e/fa2b916dced65763a5168c6ccb91066f7639bdc88b48adda990db10c8c0b/tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6", size = 237243 }, + { url = 
"https://files.pythonhosted.org/packages/b4/04/885d3b1f650e1153cbb93a6a9782c58a972b94ea4483ae4ac5cedd5e4a09/tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd", size = 259645 }, + { url = "https://files.pythonhosted.org/packages/9c/de/6b432d66e986e501586da298e28ebeefd3edc2c780f3ad73d22566034239/tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e", size = 244584 }, + { url = "https://files.pythonhosted.org/packages/1c/9a/47c0449b98e6e7d1be6cbac02f93dd79003234ddc4aaab6ba07a9a7482e2/tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98", size = 98875 }, + { url = "https://files.pythonhosted.org/packages/ef/60/9b9638f081c6f1261e2688bd487625cd1e660d0a85bd469e91d8db969734/tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4", size = 109418 }, + { url = "https://files.pythonhosted.org/packages/04/90/2ee5f2e0362cb8a0b6499dc44f4d7d48f8fff06d28ba46e6f1eaa61a1388/tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7", size = 132708 }, + { url = "https://files.pythonhosted.org/packages/c0/ec/46b4108816de6b385141f082ba99e315501ccd0a2ea23db4a100dd3990ea/tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c", size = 123582 }, + { url = "https://files.pythonhosted.org/packages/a0/bd/b470466d0137b37b68d24556c38a0cc819e8febe392d5b199dcd7f578365/tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13", size = 232543 }, + { url = 
"https://files.pythonhosted.org/packages/d9/e5/82e80ff3b751373f7cead2815bcbe2d51c895b3c990686741a8e56ec42ab/tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281", size = 241691 }, + { url = "https://files.pythonhosted.org/packages/05/7e/2a110bc2713557d6a1bfb06af23dd01e7dde52b6ee7dadc589868f9abfac/tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272", size = 251170 }, + { url = "https://files.pythonhosted.org/packages/64/7b/22d713946efe00e0adbcdfd6d1aa119ae03fd0b60ebed51ebb3fa9f5a2e5/tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140", size = 236530 }, + { url = "https://files.pythonhosted.org/packages/38/31/3a76f67da4b0cf37b742ca76beaf819dca0ebef26d78fc794a576e08accf/tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2", size = 258666 }, + { url = "https://files.pythonhosted.org/packages/07/10/5af1293da642aded87e8a988753945d0cf7e00a9452d3911dd3bb354c9e2/tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744", size = 243954 }, + { url = "https://files.pythonhosted.org/packages/5b/b9/1ed31d167be802da0fc95020d04cd27b7d7065cc6fbefdd2f9186f60d7bd/tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec", size = 98724 }, + { url = "https://files.pythonhosted.org/packages/c7/32/b0963458706accd9afcfeb867c0f9175a741bf7b19cd424230714d722198/tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69", size = 109383 }, + { url = 
"https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257 }, +] + +[[package]] +name = "traitlets" +version = "5.14.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359 }, +] + +[[package]] +name = "typing-extensions" +version = "4.12.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/df/db/f35a00659bc03fec321ba8bce9420de607a1d37f8342eee1863174c69557/typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8", size = 85321 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 }, +] + +[[package]] +name = "tzdata" +version = "2024.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e1/34/943888654477a574a86a98e9896bae89c7aa15078ec29f490fef2f1e5384/tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc", size = 193282 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/a6/ab/7e5f53c3b9d14972843a647d8d7a853969a58aecc7559cb3267302c94774/tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd", size = 346586 }, +] + +[[package]] +name = "urllib3" +version = "2.2.3" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +sdist = { url = "https://files.pythonhosted.org/packages/ed/63/22ba4ebfe7430b76388e7cd448d5478814d3032121827c12a2cc287e2260/urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9", size = 300677 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/d9/5f4c13cecde62396b0d3fe530a50ccea91e7dfc1ccf0e09c228841bb5ba8/urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac", size = 126338 }, +] + +[[package]] +name = "urllib3" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", + "python_full_version == '3.9.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/aa/63/e53da845320b757bf29ef6a9062f5c669fe997973f966045cb019c3f4b66/urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d", size = 307268 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/19/4ec628951a74043532ca2cf5d97b7b14863931476d117c471e8e2b1eb39f/urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df", size = 128369 }, +] + +[[package]] +name = "wcwidth" +version = "0.2.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6c/63/53559446a878410fc5a5974feb13d31d78d752eb18aeba59c7fef1af7598/wcwidth-0.2.13.tar.gz", hash = 
"sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5", size = 101301 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166 }, +] + +[[package]] +name = "zipp" +version = "3.20.2" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +sdist = { url = "https://files.pythonhosted.org/packages/54/bf/5c0000c44ebc80123ecbdddba1f5dcd94a5ada602a9c225d84b5aaa55e86/zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29", size = 24199 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/8b/5ba542fa83c90e09eac972fc9baca7a88e7e7ca4b221a89251954019308b/zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350", size = 9200 }, +] + +[[package]] +name = "zipp" +version = "3.21.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version == '3.9.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/3f/50/bad581df71744867e9468ebd0bcd6505de3b275e06f202c2cb016e3ff56f/zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4", size = 24545 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/1a/7e4798e9339adc931158c9d69ecc34f5e6791489d469f5e50ec15e35f458/zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931", size = 9630 }, +] From dfe4242ce4097a2f923939e443c6686c9d20c0af Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 22 Jan 2025 05:52:32 -0500 Subject: [PATCH 091/248] Update dependencies prior to release (#999) --- Cargo.lock | 247 ++++++++++++++++++++++++++++------------------------- Cargo.toml | 4 +- 2 files changed, 
133 insertions(+), 118 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 105cc30c2..5a74a4839 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -179,9 +179,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91839b07e474b3995035fd8ac33ee54f9c9ccbbb1ea33d9909c71bffdf1259d" +checksum = "eaf3437355979f1e93ba84ba108c38be5767713051f3c8ffbf07c094e2e61f9f" dependencies = [ "arrow-arith", "arrow-array", @@ -201,9 +201,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "855c57c4efd26722b044dcd3e348252560e3e0333087fb9f6479dc0bf744054f" +checksum = "31dce77d2985522288edae7206bffd5fc4996491841dda01a13a58415867e681" dependencies = [ "arrow-array", "arrow-buffer", @@ -216,9 +216,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd03279cea46569acf9295f6224fbc370c5df184b4d2ecfe97ccb131d5615a7f" +checksum = "2d45fe6d3faed0435b7313e59a02583b14c6c6339fa7729e94c32a20af319a79" dependencies = [ "ahash", "arrow-buffer", @@ -233,9 +233,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e4a9b9b1d6d7117f6138e13bc4dd5daa7f94e671b70e8c9c4dc37b4f5ecfc16" +checksum = "2b02656a35cc103f28084bc80a0159668e0a680d919cef127bd7e0aaccb06ec1" dependencies = [ "bytes", "half", @@ -244,9 +244,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc70e39916e60c5b7af7a8e2719e3ae589326039e1e863675a008bee5ffe90fd" +checksum = 
"c73c6233c5b5d635a56f6010e6eb1ab9e30e94707db21cea03da317f67d84cf3" dependencies = [ "arrow-array", "arrow-buffer", @@ -265,9 +265,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "789b2af43c1049b03a8d088ff6b2257cdcea1756cd76b174b1f2600356771b97" +checksum = "ec222848d70fea5a32af9c3602b08f5d740d5e2d33fbd76bf6fd88759b5b13a7" dependencies = [ "arrow-array", "arrow-buffer", @@ -284,9 +284,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4e75edf21ffd53744a9b8e3ed11101f610e7ceb1a29860432824f1834a1f623" +checksum = "b7f2861ffa86f107b8ab577d86cff7c7a490243eabe961ba1e1af4f27542bb79" dependencies = [ "arrow-buffer", "arrow-schema", @@ -296,9 +296,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d186a909dece9160bf8312f5124d797884f608ef5435a36d9d608e0b2a9bcbf8" +checksum = "0270dc511f11bb5fa98a25020ad51a99ca5b08d8a8dfbd17503bb9dba0388f0b" dependencies = [ "arrow-array", "arrow-buffer", @@ -311,9 +311,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b66ff2fedc1222942d0bd2fd391cb14a85baa3857be95c9373179bd616753b85" +checksum = "0eff38eeb8a971ad3a4caf62c5d57f0cff8a48b64a55e3207c4fd696a9234aad" dependencies = [ "arrow-array", "arrow-buffer", @@ -331,9 +331,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ece7b5bc1180e6d82d1a60e1688c199829e8842e38497563c3ab6ea813e527fd" +checksum = "c6f202a879d287099139ff0d121e7f55ae5e0efe634b8cf2106ebc27a8715dee" dependencies = [ "arrow-array", 
"arrow-buffer", @@ -346,9 +346,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745c114c8f0e8ce211c83389270de6fbe96a9088a7b32c2a041258a443fe83ff" +checksum = "a8f936954991c360ba762dff23f5dda16300774fafd722353d9683abd97630ae" dependencies = [ "ahash", "arrow-array", @@ -360,18 +360,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b95513080e728e4cec37f1ff5af4f12c9688d47795d17cda80b6ec2cf74d4678" +checksum = "9579b9d8bce47aa41389fe344f2c6758279983b7c0ebb4013e283e3e91bb450e" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", ] [[package]] name = "arrow-select" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e415279094ea70323c032c6e739c48ad8d80e78a09bef7117b8718ad5bf3722" +checksum = "7471ba126d0b0aaa24b50a36bc6c25e4e74869a1fd1a5553357027a0b1c8d1f1" dependencies = [ "ahash", "arrow-array", @@ -383,9 +383,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d956cae7002eb8d83a27dbd34daaea1cf5b75852f0b84deb4d93a276e92bbf" +checksum = "72993b01cb62507b06f1fb49648d7286c8989ecfabdb7b77a750fcb54410731b" dependencies = [ "arrow-array", "arrow-buffer", @@ -444,7 +444,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -455,7 +455,7 @@ checksum = "3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -528,9 +528,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = 
"bitflags" -version = "2.6.0" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" [[package]] name = "blake2" @@ -635,9 +635,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.7" +version = "1.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a012a0df96dd6d06ba9a1b29d6402d1a5d77c6befd2566afdc26e10603dc93d7" +checksum = "13208fcbb66eaeffe09b99fffbe1af420f00a7b35aa99ad683dfc1aa76145229" dependencies = [ "jobserver", "libc", @@ -671,9 +671,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd6dd8046d00723a59a2f8c5f295c515b9bb9a331ee4f8f3d4dd49e428acd3b6" +checksum = "9c6ac4f2c0bf0f44e9161aec9675e1050aa4a530663c4a9e37e108fa948bca9f" dependencies = [ "chrono", "chrono-tz-build", @@ -1189,7 +1189,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f5de3c8f386ea991696553afe241a326ecbc3c98a12c562867e4be754d3a060c" dependencies = [ "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -1406,7 +1406,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -1475,6 +1475,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1540,7 +1546,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" 
dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -1662,6 +1668,11 @@ name = "hashbrown" version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] [[package]] name = "heck" @@ -1916,7 +1927,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -1994,9 +2005,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.76" +version = "0.3.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" dependencies = [ "once_cell", "wasm-bindgen", @@ -2152,9 +2163,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.22" +version = "0.4.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" [[package]] name = "lz4_flex" @@ -2218,9 +2229,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394" +checksum = "b8402cab7aefae129c6977bb0ff1b8fd9a04eb5b51efc50a70bea51cda0c7924" dependencies = [ "adler2", ] @@ -2404,9 +2415,9 @@ dependencies = [ [[package]] name = "parquet" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b449890367085eb65d7d3321540abc3d7babbd179ce31df0016e90719114191" +checksum = 
"8957c0c95a6a1804f3e51a18f69df29be53856a8c5768cc9b6d00fcafcd2917c" dependencies = [ "ahash", "arrow-array", @@ -2579,19 +2590,19 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.27" +version = "0.2.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "483f8c21f64f3ea09fe0f30f5d48c3e8eefe5dac9129f0075f76593b4c1da705" +checksum = "6924ced06e1f7dfe3fa48d57b9f74f55d8915f5036121bef647ef4b204895fac" dependencies = [ "proc-macro2", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] name = "proc-macro2" -version = "1.0.92" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" +checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" dependencies = [ "unicode-ident", ] @@ -2622,7 +2633,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.95", + "syn 2.0.96", "tempfile", ] @@ -2636,7 +2647,7 @@ dependencies = [ "itertools", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -2726,7 +2737,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -2739,7 +2750,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -2771,7 +2782,7 @@ dependencies = [ "rustc-hash", "rustls", "socket2", - "thiserror 2.0.10", + "thiserror 2.0.11", "tokio", "tracing", ] @@ -2790,7 +2801,7 @@ dependencies = [ "rustls", "rustls-pki-types", "slab", - "thiserror 2.0.10", + "thiserror 2.0.11", "tinyvec", "tracing", "web-time", @@ -2866,7 +2877,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -2875,7 +2886,7 @@ version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", ] [[package]] @@ -2915,11 +2926,11 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "regress" -version = "0.10.1" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1541daf4e4ed43a0922b7969bdc2170178bcacc5dabf7e39bc508a9fa3953a7a" +checksum = "4f56e622c2378013c6c61e2bd776604c46dc1087b2dc5293275a0c20a44f0771" dependencies = [ - "hashbrown 0.14.5", + "hashbrown 0.15.2", "memchr", ] @@ -3026,7 +3037,7 @@ version = "0.38.43" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "errno", "libc", "linux-raw-sys", @@ -3035,9 +3046,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.20" +version = "0.23.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5065c3f250cbd332cd894be57c40fa52387247659b14a2d6041d121547903b1b" +checksum = "8f287924602bf649d949c63dc8ac8b235fa5387d394020705b80c4eb597ce5b8" dependencies = [ "once_cell", "ring", @@ -3139,7 +3150,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -3154,7 +3165,7 @@ version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "core-foundation", "core-foundation-sys", "libc", @@ -3212,7 +3223,7 @@ checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -3223,14 +3234,14 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", 
"quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] name = "serde_json" -version = "1.0.135" +version = "1.0.136" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b0d7ba2887406110130a978386c4e1befb98c674b4fba677954e4db976630d9" +checksum = "336a0c23cf42a38d9eaa7cd22c7040d04e1228a19a933890805ffd00a16437d2" dependencies = [ "itoa", "memchr", @@ -3247,7 +3258,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -3331,7 +3342,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -3374,7 +3385,7 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -3418,7 +3429,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -3442,7 +3453,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.95", + "syn 2.0.96", "typify", "walkdir", ] @@ -3466,9 +3477,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.95" +version = "2.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46f71c0377baf4ef1cc3e3402ded576dccc315800fbc62dfc7fe04b009773b4a" +checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" dependencies = [ "proc-macro2", "quote", @@ -3492,7 +3503,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -3526,11 +3537,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.10" +version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3ac7f54ca534db81081ef1c1e7f6ea8a3ef428d2fc069097c079443d24124d3" +checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" dependencies = [ - "thiserror-impl 2.0.10", + 
"thiserror-impl 2.0.11", ] [[package]] @@ -3541,18 +3552,18 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] name = "thiserror-impl" -version = "2.0.10" +version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e9465d30713b56a37ede7185763c3492a91be2f5fa68d958c44e41ab9248beb" +checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -3624,7 +3635,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -3696,7 +3707,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -3762,7 +3773,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -3796,7 +3807,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.95", + "syn 2.0.96", "thiserror 1.0.69", "unicode-ident", ] @@ -3814,7 +3825,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.95", + "syn 2.0.96", "typify-impl", ] @@ -3879,9 +3890,9 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" +checksum = "744018581f9a3454a9e15beb8a33b017183f1e7c0cd170232a2d1453b23a51c4" dependencies = [ "getrandom", "serde", @@ -3920,34 +3931,35 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = 
"wasm-bindgen" -version = "0.2.99" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" dependencies = [ "cfg-if", "once_cell", + "rustversion", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.99" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" dependencies = [ "bumpalo", "log", "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.49" +version = "0.4.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38176d9b44ea84e9184eff0bc34cc167ed044f816accfe5922e54d84cf48eca2" +checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" dependencies = [ "cfg-if", "js-sys", @@ -3958,9 +3970,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.99" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3968,22 +3980,25 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.99" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", "wasm-bindgen-backend", "wasm-bindgen-shared", ] 
[[package]] name = "wasm-bindgen-shared" -version = "0.2.99" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] [[package]] name = "wasm-streams" @@ -4000,9 +4015,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.76" +version = "0.3.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" dependencies = [ "js-sys", "wasm-bindgen", @@ -4211,7 +4226,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", "synstructure", ] @@ -4233,7 +4248,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] @@ -4253,7 +4268,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", "synstructure", ] @@ -4282,7 +4297,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.95", + "syn 2.0.96", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 48219414a..10cffccb1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,7 +34,7 @@ protoc = [ "datafusion-substrait/protoc" ] substrait = ["dep:datafusion-substrait"] [dependencies] -tokio = { version = "1.41", features = ["macros", "rt", "rt-multi-thread", "sync"] } +tokio = { version = "1.42", features = ["macros", "rt", "rt-multi-thread", "sync"] } pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] } 
pyo3-async-runtimes = { version = "0.22", features = ["tokio-runtime"]} arrow = { version = "53", features = ["pyarrow"] } @@ -43,7 +43,7 @@ datafusion-substrait = { version = "44.0.0", optional = true } datafusion-proto = { version = "44.0.0" } datafusion-ffi = { version = "44.0.0" } prost = "0.13" # keep in line with `datafusion-substrait` -uuid = { version = "1.11", features = ["v4"] } +uuid = { version = "1.12", features = ["v4"] } mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] } async-trait = "0.1" futures = "0.3" From 78e72c9445db4e78dcda2562e251beea4f1ad470 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 22 Jan 2025 05:53:13 -0500 Subject: [PATCH 092/248] Apply import ordering in ruff check (#1001) --- benchmarks/db-benchmark/groupby-datafusion.py | 12 +-- benchmarks/db-benchmark/join-datafusion.py | 6 +- benchmarks/tpch/tpch.py | 3 +- dev/release/generate-changelog.py | 5 +- examples/export.py | 1 - .../python/tests/_test_table_provider.py | 2 +- examples/import.py | 3 +- examples/python-udaf.py | 5 +- examples/python-udf-comparisons.py | 6 +- examples/python-udf.py | 3 +- examples/python-udwf.py | 7 +- examples/query-pyarrow-data.py | 3 +- examples/sql-parquet-s3.py | 1 + examples/sql-to-pandas.py | 1 - examples/sql-using-python-udaf.py | 2 +- examples/sql-using-python-udf.py | 2 +- examples/tpch/_tests.py | 6 +- examples/tpch/convert_data_to_parquet.py | 3 +- examples/tpch/q01_pricing_summary_report.py | 3 +- examples/tpch/q02_minimum_cost_supplier.py | 3 +- examples/tpch/q03_shipping_priority.py | 3 +- examples/tpch/q04_order_priority_checking.py | 4 +- examples/tpch/q05_local_supplier_volume.py | 5 +- .../tpch/q06_forecasting_revenue_change.py | 4 +- examples/tpch/q07_volume_shipping.py | 4 +- examples/tpch/q08_market_share.py | 4 +- .../tpch/q09_product_type_profit_measure.py | 3 +- examples/tpch/q10_returned_item_reporting.py | 4 +- .../q11_important_stock_identification.py | 3 +- 
examples/tpch/q12_ship_mode_order_priority.py | 4 +- examples/tpch/q13_customer_distribution.py | 3 +- examples/tpch/q14_promotion_effect.py | 4 +- examples/tpch/q15_top_supplier.py | 4 +- .../tpch/q16_part_supplier_relationship.py | 3 +- examples/tpch/q17_small_quantity_order.py | 3 +- examples/tpch/q18_large_volume_customer.py | 3 +- examples/tpch/q19_discounted_revenue.py | 3 +- examples/tpch/q20_potential_part_promotion.py | 4 +- .../tpch/q21_suppliers_kept_orders_waiting.py | 3 +- examples/tpch/q22_global_sales_opportunity.py | 3 +- pyproject.toml | 4 +- python/datafusion/__init__.py | 30 +++---- python/datafusion/catalog.py | 4 +- python/datafusion/common.py | 3 +- python/datafusion/context.py | 23 ++--- python/datafusion/dataframe.py | 21 +++-- python/datafusion/expr.py | 5 +- python/datafusion/functions.py | 16 ++-- python/datafusion/input/location.py | 2 +- python/datafusion/plan.py | 4 +- python/datafusion/record_batch.py | 3 +- python/datafusion/substrait.py | 8 +- python/datafusion/udf.py | 9 +- python/tests/conftest.py | 2 +- python/tests/test_aggregation.py | 1 - python/tests/test_config.py | 2 +- python/tests/test_context.py | 3 +- python/tests/test_dataframe.py | 5 +- python/tests/test_functions.py | 6 +- python/tests/test_imports.py | 89 +++++++++---------- python/tests/test_indexing.py | 1 - python/tests/test_input.py | 1 + python/tests/test_plans.py | 2 +- python/tests/test_sql.py | 5 +- python/tests/test_store.py | 1 - python/tests/test_substrait.py | 3 +- python/tests/test_udaf.py | 1 - python/tests/test_udf.py | 2 +- python/tests/test_udwf.py | 6 +- 69 files changed, 221 insertions(+), 189 deletions(-) diff --git a/benchmarks/db-benchmark/groupby-datafusion.py b/benchmarks/db-benchmark/groupby-datafusion.py index 960c8ba9a..04bf7a149 100644 --- a/benchmarks/db-benchmark/groupby-datafusion.py +++ b/benchmarks/db-benchmark/groupby-datafusion.py @@ -15,21 +15,23 @@ # specific language governing permissions and limitations # under the License. 
-import os import gc +import os import timeit + import datafusion as df +import pyarrow from datafusion import ( - col, - functions as f, RuntimeEnvBuilder, SessionConfig, SessionContext, + col, +) +from datafusion import ( + functions as f, ) -import pyarrow from pyarrow import csv as pacsv - print("# groupby-datafusion.py", flush=True) exec(open("./_helpers/helpers.py").read()) diff --git a/benchmarks/db-benchmark/join-datafusion.py b/benchmarks/db-benchmark/join-datafusion.py index 811ad8707..b45ebf632 100755 --- a/benchmarks/db-benchmark/join-datafusion.py +++ b/benchmarks/db-benchmark/join-datafusion.py @@ -15,15 +15,15 @@ # specific language governing permissions and limitations # under the License. -import os import gc +import os import timeit + import datafusion as df -from datafusion import functions as f from datafusion import col +from datafusion import functions as f from pyarrow import csv as pacsv - print("# join-datafusion.py", flush=True) exec(open("./_helpers/helpers.py").read()) diff --git a/benchmarks/tpch/tpch.py b/benchmarks/tpch/tpch.py index daa831b55..fb86b12b6 100644 --- a/benchmarks/tpch/tpch.py +++ b/benchmarks/tpch/tpch.py @@ -16,9 +16,10 @@ # under the License. import argparse -from datafusion import SessionContext import time +from datafusion import SessionContext + def bench(data_path, query_path): with open("results.csv", "w") as results: diff --git a/dev/release/generate-changelog.py b/dev/release/generate-changelog.py index 0f07457d0..2564eea86 100755 --- a/dev/release/generate-changelog.py +++ b/dev/release/generate-changelog.py @@ -16,11 +16,12 @@ # limitations under the License. 
import argparse -import sys -from github import Github import os import re import subprocess +import sys + +from github import Github def print_pulls(repo_name, title, pulls): diff --git a/examples/export.py b/examples/export.py index cc02de52b..c7a387bcb 100644 --- a/examples/export.py +++ b/examples/export.py @@ -17,7 +17,6 @@ import datafusion - # create a context ctx = datafusion.SessionContext() diff --git a/examples/ffi-table-provider/python/tests/_test_table_provider.py b/examples/ffi-table-provider/python/tests/_test_table_provider.py index 56c05e4fa..0db3ec561 100644 --- a/examples/ffi-table-provider/python/tests/_test_table_provider.py +++ b/examples/ffi-table-provider/python/tests/_test_table_provider.py @@ -15,9 +15,9 @@ # specific language governing permissions and limitations # under the License. +import pyarrow as pa from datafusion import SessionContext from ffi_table_provider import MyTableProvider -import pyarrow as pa def test_table_loading(): diff --git a/examples/import.py b/examples/import.py index c9d2e8cb6..7b5ab5082 100644 --- a/examples/import.py +++ b/examples/import.py @@ -16,10 +16,9 @@ # under the License. import datafusion -import pyarrow as pa import pandas as pd import polars as pl - +import pyarrow as pa # Create a context ctx = datafusion.SessionContext() diff --git a/examples/python-udaf.py b/examples/python-udaf.py index ed705f5a9..538f69571 100644 --- a/examples/python-udaf.py +++ b/examples/python-udaf.py @@ -15,11 +15,10 @@ # specific language governing permissions and limitations # under the License. 
+import datafusion import pyarrow import pyarrow.compute -import datafusion -from datafusion import udaf, Accumulator -from datafusion import col +from datafusion import Accumulator, col, udaf class MyAccumulator(Accumulator): diff --git a/examples/python-udf-comparisons.py b/examples/python-udf-comparisons.py index 9a84dd730..c5d5ec8dd 100644 --- a/examples/python-udf-comparisons.py +++ b/examples/python-udf-comparisons.py @@ -15,11 +15,13 @@ # specific language governing permissions and limitations # under the License. -from datafusion import SessionContext, col, lit, udf, functions as F import os +import time + import pyarrow as pa import pyarrow.compute as pc -import time +from datafusion import SessionContext, col, lit, udf +from datafusion import functions as F path = os.path.dirname(os.path.abspath(__file__)) filepath = os.path.join(path, "./tpch/data/lineitem.parquet") diff --git a/examples/python-udf.py b/examples/python-udf.py index 30edd4198..fb2bc253e 100644 --- a/examples/python-udf.py +++ b/examples/python-udf.py @@ -16,7 +16,8 @@ # under the License. import pyarrow -from datafusion import udf, SessionContext, functions as f +from datafusion import SessionContext, udf +from datafusion import functions as f def is_null(array: pyarrow.Array) -> pyarrow.Array: diff --git a/examples/python-udwf.py b/examples/python-udwf.py index 55de2bdc7..32f8fadaa 100644 --- a/examples/python-udwf.py +++ b/examples/python-udwf.py @@ -15,11 +15,12 @@ # specific language governing permissions and limitations # under the License. 
-import pyarrow as pa import datafusion -from datafusion import udwf, functions as f, col, lit -from datafusion.udf import WindowEvaluator +import pyarrow as pa +from datafusion import col, lit, udwf +from datafusion import functions as f from datafusion.expr import WindowFrame +from datafusion.udf import WindowEvaluator # This example creates five different examples of user defined window functions in order # to demonstrate the variety of ways a user may need to implement. diff --git a/examples/query-pyarrow-data.py b/examples/query-pyarrow-data.py index 83e6884a7..e3456fb5b 100644 --- a/examples/query-pyarrow-data.py +++ b/examples/query-pyarrow-data.py @@ -16,9 +16,8 @@ # under the License. import datafusion -from datafusion import col import pyarrow - +from datafusion import col # create a context ctx = datafusion.SessionContext() diff --git a/examples/sql-parquet-s3.py b/examples/sql-parquet-s3.py index 61f1e0c50..866e2ac68 100644 --- a/examples/sql-parquet-s3.py +++ b/examples/sql-parquet-s3.py @@ -16,6 +16,7 @@ # under the License. import os + import datafusion from datafusion.object_store import AmazonS3 diff --git a/examples/sql-to-pandas.py b/examples/sql-to-pandas.py index 3e99b22de..34f7bde1b 100644 --- a/examples/sql-to-pandas.py +++ b/examples/sql-to-pandas.py @@ -17,7 +17,6 @@ from datafusion import SessionContext - # Create a DataFusion context ctx = SessionContext() diff --git a/examples/sql-using-python-udaf.py b/examples/sql-using-python-udaf.py index 7ccf5d3cb..60ab8d134 100644 --- a/examples/sql-using-python-udaf.py +++ b/examples/sql-using-python-udaf.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. 
-from datafusion import udaf, SessionContext, Accumulator import pyarrow as pa +from datafusion import Accumulator, SessionContext, udaf # Define a user-defined aggregation function (UDAF) diff --git a/examples/sql-using-python-udf.py b/examples/sql-using-python-udf.py index d6bbe3ab0..2f0a0b67d 100644 --- a/examples/sql-using-python-udf.py +++ b/examples/sql-using-python-udf.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. -from datafusion import udf, SessionContext import pyarrow as pa +from datafusion import SessionContext, udf # Define a user-defined function (UDF) diff --git a/examples/tpch/_tests.py b/examples/tpch/_tests.py index 3ce9cdfe5..c4d872085 100644 --- a/examples/tpch/_tests.py +++ b/examples/tpch/_tests.py @@ -15,10 +15,12 @@ # specific language governing permissions and limitations # under the License. -import pytest from importlib import import_module + import pyarrow as pa -from datafusion import DataFrame, col, lit, functions as F +import pytest +from datafusion import DataFrame, col, lit +from datafusion import functions as F from util import get_answer_file diff --git a/examples/tpch/convert_data_to_parquet.py b/examples/tpch/convert_data_to_parquet.py index cb0b2f0bd..73097fac5 100644 --- a/examples/tpch/convert_data_to_parquet.py +++ b/examples/tpch/convert_data_to_parquet.py @@ -23,8 +23,9 @@ """ import os -import pyarrow + import datafusion +import pyarrow ctx = datafusion.SessionContext() diff --git a/examples/tpch/q01_pricing_summary_report.py b/examples/tpch/q01_pricing_summary_report.py index cb9485a7a..3f97f00dc 100644 --- a/examples/tpch/q01_pricing_summary_report.py +++ b/examples/tpch/q01_pricing_summary_report.py @@ -30,7 +30,8 @@ """ import pyarrow as pa -from datafusion import SessionContext, col, lit, functions as F +from datafusion import SessionContext, col, lit +from datafusion import functions as F from util import get_data_path ctx = SessionContext() diff --git 
a/examples/tpch/q02_minimum_cost_supplier.py b/examples/tpch/q02_minimum_cost_supplier.py index c4ccf8ad3..7390d0892 100644 --- a/examples/tpch/q02_minimum_cost_supplier.py +++ b/examples/tpch/q02_minimum_cost_supplier.py @@ -30,7 +30,8 @@ """ import datafusion -from datafusion import SessionContext, col, lit, functions as F +from datafusion import SessionContext, col, lit +from datafusion import functions as F from util import get_data_path # This is the part we're looking for. Values selected here differ from the spec in order to run diff --git a/examples/tpch/q03_shipping_priority.py b/examples/tpch/q03_shipping_priority.py index 5ebab13c0..fc1231e0a 100644 --- a/examples/tpch/q03_shipping_priority.py +++ b/examples/tpch/q03_shipping_priority.py @@ -27,7 +27,8 @@ as part of their TPC Benchmark H Specification revision 2.18.0. """ -from datafusion import SessionContext, col, lit, functions as F +from datafusion import SessionContext, col, lit +from datafusion import functions as F from util import get_data_path SEGMENT_OF_INTEREST = "BUILDING" diff --git a/examples/tpch/q04_order_priority_checking.py b/examples/tpch/q04_order_priority_checking.py index 8bf02cb83..426338aea 100644 --- a/examples/tpch/q04_order_priority_checking.py +++ b/examples/tpch/q04_order_priority_checking.py @@ -27,8 +27,10 @@ """ from datetime import datetime + import pyarrow as pa -from datafusion import SessionContext, col, lit, functions as F +from datafusion import SessionContext, col, lit +from datafusion import functions as F from util import get_data_path # Ideally we could put 3 months into the interval. See note below. 
diff --git a/examples/tpch/q05_local_supplier_volume.py b/examples/tpch/q05_local_supplier_volume.py index 413a4acb9..fa2b01dea 100644 --- a/examples/tpch/q05_local_supplier_volume.py +++ b/examples/tpch/q05_local_supplier_volume.py @@ -30,11 +30,12 @@ """ from datetime import datetime + import pyarrow as pa -from datafusion import SessionContext, col, lit, functions as F +from datafusion import SessionContext, col, lit +from datafusion import functions as F from util import get_data_path - DATE_OF_INTEREST = "1994-01-01" INTERVAL_DAYS = 365 REGION_OF_INTEREST = "ASIA" diff --git a/examples/tpch/q06_forecasting_revenue_change.py b/examples/tpch/q06_forecasting_revenue_change.py index eaf9b0c29..1de5848b1 100644 --- a/examples/tpch/q06_forecasting_revenue_change.py +++ b/examples/tpch/q06_forecasting_revenue_change.py @@ -30,8 +30,10 @@ """ from datetime import datetime + import pyarrow as pa -from datafusion import SessionContext, col, lit, functions as F +from datafusion import SessionContext, col, lit +from datafusion import functions as F from util import get_data_path # Variables from the example query diff --git a/examples/tpch/q07_volume_shipping.py b/examples/tpch/q07_volume_shipping.py index 18c290d9c..a84cf728a 100644 --- a/examples/tpch/q07_volume_shipping.py +++ b/examples/tpch/q07_volume_shipping.py @@ -29,8 +29,10 @@ """ from datetime import datetime + import pyarrow as pa -from datafusion import SessionContext, col, lit, functions as F +from datafusion import SessionContext, col, lit +from datafusion import functions as F from util import get_data_path # Variables of interest to query over diff --git a/examples/tpch/q08_market_share.py b/examples/tpch/q08_market_share.py index 7138ab65a..d46df30f2 100644 --- a/examples/tpch/q08_market_share.py +++ b/examples/tpch/q08_market_share.py @@ -28,8 +28,10 @@ """ from datetime import datetime + import pyarrow as pa -from datafusion import SessionContext, col, lit, functions as F +from datafusion import 
SessionContext, col, lit +from datafusion import functions as F from util import get_data_path supplier_nation = lit("BRAZIL") diff --git a/examples/tpch/q09_product_type_profit_measure.py b/examples/tpch/q09_product_type_profit_measure.py index aa47d76c0..e2abbd095 100644 --- a/examples/tpch/q09_product_type_profit_measure.py +++ b/examples/tpch/q09_product_type_profit_measure.py @@ -30,7 +30,8 @@ """ import pyarrow as pa -from datafusion import SessionContext, col, lit, functions as F +from datafusion import SessionContext, col, lit +from datafusion import functions as F from util import get_data_path part_color = lit("green") diff --git a/examples/tpch/q10_returned_item_reporting.py b/examples/tpch/q10_returned_item_reporting.py index 94b398c1d..ed822e264 100644 --- a/examples/tpch/q10_returned_item_reporting.py +++ b/examples/tpch/q10_returned_item_reporting.py @@ -30,8 +30,10 @@ """ from datetime import datetime + import pyarrow as pa -from datafusion import SessionContext, col, lit, functions as F +from datafusion import SessionContext, col, lit +from datafusion import functions as F from util import get_data_path DATE_START_OF_QUARTER = "1993-10-01" diff --git a/examples/tpch/q11_important_stock_identification.py b/examples/tpch/q11_important_stock_identification.py index 707265e16..22829ab7c 100644 --- a/examples/tpch/q11_important_stock_identification.py +++ b/examples/tpch/q11_important_stock_identification.py @@ -27,7 +27,8 @@ as part of their TPC Benchmark H Specification revision 2.18.0. 
""" -from datafusion import SessionContext, WindowFrame, col, lit, functions as F +from datafusion import SessionContext, WindowFrame, col, lit +from datafusion import functions as F from util import get_data_path NATION = "GERMANY" diff --git a/examples/tpch/q12_ship_mode_order_priority.py b/examples/tpch/q12_ship_mode_order_priority.py index def2a6c30..f1d894940 100644 --- a/examples/tpch/q12_ship_mode_order_priority.py +++ b/examples/tpch/q12_ship_mode_order_priority.py @@ -30,8 +30,10 @@ """ from datetime import datetime + import pyarrow as pa -from datafusion import SessionContext, col, lit, functions as F +from datafusion import SessionContext, col, lit +from datafusion import functions as F from util import get_data_path SHIP_MODE_1 = "MAIL" diff --git a/examples/tpch/q13_customer_distribution.py b/examples/tpch/q13_customer_distribution.py index 67365a96a..93f082ea3 100644 --- a/examples/tpch/q13_customer_distribution.py +++ b/examples/tpch/q13_customer_distribution.py @@ -28,7 +28,8 @@ as part of their TPC Benchmark H Specification revision 2.18.0. 
""" -from datafusion import SessionContext, col, lit, functions as F +from datafusion import SessionContext, col, lit +from datafusion import functions as F from util import get_data_path WORD_1 = "special" diff --git a/examples/tpch/q14_promotion_effect.py b/examples/tpch/q14_promotion_effect.py index cd26ee2bd..d62f76e3c 100644 --- a/examples/tpch/q14_promotion_effect.py +++ b/examples/tpch/q14_promotion_effect.py @@ -27,8 +27,10 @@ """ from datetime import datetime + import pyarrow as pa -from datafusion import SessionContext, col, lit, functions as F +from datafusion import SessionContext, col, lit +from datafusion import functions as F from util import get_data_path DATE = "1995-09-01" diff --git a/examples/tpch/q15_top_supplier.py b/examples/tpch/q15_top_supplier.py index 0bc316f7a..c321048f2 100644 --- a/examples/tpch/q15_top_supplier.py +++ b/examples/tpch/q15_top_supplier.py @@ -27,8 +27,10 @@ """ from datetime import datetime + import pyarrow as pa -from datafusion import SessionContext, WindowFrame, col, lit, functions as F +from datafusion import SessionContext, WindowFrame, col, lit +from datafusion import functions as F from util import get_data_path DATE = "1996-01-01" diff --git a/examples/tpch/q16_part_supplier_relationship.py b/examples/tpch/q16_part_supplier_relationship.py index a6a0c43eb..65043ffda 100644 --- a/examples/tpch/q16_part_supplier_relationship.py +++ b/examples/tpch/q16_part_supplier_relationship.py @@ -29,7 +29,8 @@ """ import pyarrow as pa -from datafusion import SessionContext, col, lit, functions as F +from datafusion import SessionContext, col, lit +from datafusion import functions as F from util import get_data_path BRAND = "Brand#45" diff --git a/examples/tpch/q17_small_quantity_order.py b/examples/tpch/q17_small_quantity_order.py index d7b43d498..6d76fe506 100644 --- a/examples/tpch/q17_small_quantity_order.py +++ b/examples/tpch/q17_small_quantity_order.py @@ -28,7 +28,8 @@ as part of their TPC Benchmark H Specification 
revision 2.18.0. """ -from datafusion import SessionContext, WindowFrame, col, lit, functions as F +from datafusion import SessionContext, WindowFrame, col, lit +from datafusion import functions as F from util import get_data_path BRAND = "Brand#23" diff --git a/examples/tpch/q18_large_volume_customer.py b/examples/tpch/q18_large_volume_customer.py index 165fce033..834d181c9 100644 --- a/examples/tpch/q18_large_volume_customer.py +++ b/examples/tpch/q18_large_volume_customer.py @@ -26,7 +26,8 @@ as part of their TPC Benchmark H Specification revision 2.18.0. """ -from datafusion import SessionContext, col, lit, functions as F +from datafusion import SessionContext, col, lit +from datafusion import functions as F from util import get_data_path QUANTITY = 300 diff --git a/examples/tpch/q19_discounted_revenue.py b/examples/tpch/q19_discounted_revenue.py index 4aed0cbae..2b87e1120 100644 --- a/examples/tpch/q19_discounted_revenue.py +++ b/examples/tpch/q19_discounted_revenue.py @@ -27,7 +27,8 @@ """ import pyarrow as pa -from datafusion import SessionContext, col, lit, udf, functions as F +from datafusion import SessionContext, col, lit, udf +from datafusion import functions as F from util import get_data_path items_of_interest = { diff --git a/examples/tpch/q20_potential_part_promotion.py b/examples/tpch/q20_potential_part_promotion.py index d720cdce6..a25188d31 100644 --- a/examples/tpch/q20_potential_part_promotion.py +++ b/examples/tpch/q20_potential_part_promotion.py @@ -28,8 +28,10 @@ """ from datetime import datetime + import pyarrow as pa -from datafusion import SessionContext, col, lit, functions as F +from datafusion import SessionContext, col, lit +from datafusion import functions as F from util import get_data_path COLOR_OF_INTEREST = "forest" diff --git a/examples/tpch/q21_suppliers_kept_orders_waiting.py b/examples/tpch/q21_suppliers_kept_orders_waiting.py index 27cf816fa..9bbaad779 100644 --- a/examples/tpch/q21_suppliers_kept_orders_waiting.py +++ 
b/examples/tpch/q21_suppliers_kept_orders_waiting.py @@ -26,7 +26,8 @@ as part of their TPC Benchmark H Specification revision 2.18.0. """ -from datafusion import SessionContext, col, lit, functions as F +from datafusion import SessionContext, col, lit +from datafusion import functions as F from util import get_data_path NATION_OF_INTEREST = "SAUDI ARABIA" diff --git a/examples/tpch/q22_global_sales_opportunity.py b/examples/tpch/q22_global_sales_opportunity.py index 72dce5289..c4d115b74 100644 --- a/examples/tpch/q22_global_sales_opportunity.py +++ b/examples/tpch/q22_global_sales_opportunity.py @@ -26,7 +26,8 @@ as part of their TPC Benchmark H Specification revision 2.18.0. """ -from datafusion import SessionContext, WindowFrame, col, lit, functions as F +from datafusion import SessionContext, WindowFrame, col, lit +from datafusion import functions as F from util import get_data_path NATION_CODES = [13, 31, 23, 29, 30, 18, 17] diff --git a/pyproject.toml b/pyproject.toml index 6e8acfe71..32bb28d21 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ features = ["substrait"] # Enable docstring linting using the google style guide [tool.ruff.lint] -select = ["E4", "E7", "E9", "F", "D", "W"] +select = ["E4", "E7", "E9", "F", "D", "W", "I"] [tool.ruff.lint.pydocstyle] convention = "google" @@ -100,4 +100,4 @@ docs = [ "pickleshare>=0.7.5", "sphinx-autoapi>=3.4.0", "setuptools>=75.3.0", -] \ No newline at end of file +] diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 2d8db42c8..85aefcce7 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -26,36 +26,28 @@ except ImportError: import importlib_metadata -from .context import ( - SessionContext, - SessionConfig, - RuntimeEnvBuilder, - SQLOptions, -) - -from .catalog import Catalog, Database, Table +from . import functions, object_store, substrait # The following imports are okay to remain as opaque to the user. 
from ._internal import Config - -from .record_batch import RecordBatchStream, RecordBatch - -from .udf import ScalarUDF, AggregateUDF, Accumulator, WindowUDF - +from .catalog import Catalog, Database, Table from .common import ( DFSchema, ) - +from .context import ( + RuntimeEnvBuilder, + SessionConfig, + SessionContext, + SQLOptions, +) from .dataframe import DataFrame - from .expr import ( Expr, WindowFrame, ) - -from .plan import LogicalPlan, ExecutionPlan - -from . import functions, object_store, substrait +from .plan import ExecutionPlan, LogicalPlan +from .record_batch import RecordBatch, RecordBatchStream +from .udf import Accumulator, AggregateUDF, ScalarUDF, WindowUDF __version__ = importlib_metadata.version(__name__) diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py index acd28f33d..703037665 100644 --- a/python/datafusion/catalog.py +++ b/python/datafusion/catalog.py @@ -19,10 +19,10 @@ from __future__ import annotations -import datafusion._internal as df_internal - from typing import TYPE_CHECKING +import datafusion._internal as df_internal + if TYPE_CHECKING: import pyarrow diff --git a/python/datafusion/common.py b/python/datafusion/common.py index 7db8333f2..a2298c634 100644 --- a/python/datafusion/common.py +++ b/python/datafusion/common.py @@ -16,9 +16,10 @@ # under the License. 
"""Common data types used throughout the DataFusion project.""" -from ._internal import common as common_internal from enum import Enum +from ._internal import common as common_internal + # TODO these should all have proper wrapper classes DFSchema = common_internal.DFSchema diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 3c284c9f9..864ef1c8b 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -19,26 +19,29 @@ from __future__ import annotations -from ._internal import SessionConfig as SessionConfigInternal -from ._internal import RuntimeEnvBuilder as RuntimeEnvBuilderInternal -from ._internal import SQLOptions as SQLOptionsInternal -from ._internal import SessionContext as SessionContextInternal +from typing import TYPE_CHECKING, Any, Protocol + +from typing_extensions import deprecated from datafusion.catalog import Catalog, Table from datafusion.dataframe import DataFrame from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list from datafusion.record_batch import RecordBatchStream -from datafusion.udf import ScalarUDF, AggregateUDF, WindowUDF +from datafusion.udf import AggregateUDF, ScalarUDF, WindowUDF -from typing import Any, TYPE_CHECKING, Protocol -from typing_extensions import deprecated +from ._internal import RuntimeEnvBuilder as RuntimeEnvBuilderInternal +from ._internal import SessionConfig as SessionConfigInternal +from ._internal import SessionContext as SessionContextInternal +from ._internal import SQLOptions as SQLOptionsInternal if TYPE_CHECKING: - import pyarrow + import pathlib + import pandas import polars - import pathlib - from datafusion.plan import LogicalPlan, ExecutionPlan + import pyarrow + + from datafusion.plan import ExecutionPlan, LogicalPlan class ArrowStreamExportable(Protocol): diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index b0c1abdad..7413a5fa3 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py 
@@ -20,31 +20,36 @@ """ from __future__ import annotations + import warnings from typing import ( + TYPE_CHECKING, Any, Iterable, List, - TYPE_CHECKING, Literal, - overload, Optional, Union, + overload, ) -from datafusion.record_batch import RecordBatchStream + from typing_extensions import deprecated -from datafusion.plan import LogicalPlan, ExecutionPlan + +from datafusion.plan import ExecutionPlan, LogicalPlan +from datafusion.record_batch import RecordBatchStream if TYPE_CHECKING: - import pyarrow as pa - import pandas as pd - import polars as pl import pathlib from typing import Callable, Sequence + import pandas as pd + import polars as pl + import pyarrow as pa + +from enum import Enum + from datafusion._internal import DataFrame as DataFrameInternal from datafusion.expr import Expr, SortExpr, sort_or_default -from enum import Enum # excerpt from deltalake diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 16add16f4..68ddd7c9a 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -22,12 +22,13 @@ from __future__ import annotations -from typing import Any, Optional, Type, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Optional, Type import pyarrow as pa -from datafusion.common import DataTypeMap, NullTreatment, RexType from typing_extensions import deprecated +from datafusion.common import DataTypeMap, NullTreatment, RexType + from ._internal import expr as expr_internal from ._internal import functions as functions_internal diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index c0097c6ab..7c2fa9a8f 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -18,21 +18,21 @@ from __future__ import annotations +from typing import Any, Optional + +import pyarrow as pa + from datafusion._internal import functions as f +from datafusion.common import NullTreatment +from datafusion.context import SessionContext from datafusion.expr import ( CaseBuilder, Expr, - 
WindowFrame, SortExpr, - sort_list_to_raw_sort_list, + WindowFrame, expr_list_to_raw_expr_list, + sort_list_to_raw_sort_list, ) -from datafusion.context import SessionContext -from datafusion.common import NullTreatment - -from typing import Any, Optional - -import pyarrow as pa __all__ = [ "abs", diff --git a/python/datafusion/input/location.py b/python/datafusion/input/location.py index b274539fc..a8252b53c 100644 --- a/python/datafusion/input/location.py +++ b/python/datafusion/input/location.py @@ -17,8 +17,8 @@ """The default input source for DataFusion.""" -import os import glob +import os from typing import Any from datafusion.common import DataTypeMap, SqlTable diff --git a/python/datafusion/plan.py b/python/datafusion/plan.py index a71965f41..133fc446d 100644 --- a/python/datafusion/plan.py +++ b/python/datafusion/plan.py @@ -19,9 +19,9 @@ from __future__ import annotations -import datafusion._internal as df_internal +from typing import TYPE_CHECKING, Any, List -from typing import List, Any, TYPE_CHECKING +import datafusion._internal as df_internal if TYPE_CHECKING: from datafusion.context import SessionContext diff --git a/python/datafusion/record_batch.py b/python/datafusion/record_batch.py index 75e58998f..772cd9089 100644 --- a/python/datafusion/record_batch.py +++ b/python/datafusion/record_batch.py @@ -27,9 +27,10 @@ if TYPE_CHECKING: import pyarrow - import datafusion._internal as df_internal import typing_extensions + import datafusion._internal as df_internal + class RecordBatch: """This class is essentially a wrapper for :py:class:`pyarrow.RecordBatch`.""" diff --git a/python/datafusion/substrait.py b/python/datafusion/substrait.py index dea47acca..402184d3f 100644 --- a/python/datafusion/substrait.py +++ b/python/datafusion/substrait.py @@ -23,13 +23,15 @@ from __future__ import annotations -from ._internal import substrait as substrait_internal - +import pathlib from typing import TYPE_CHECKING + from typing_extensions import deprecated -import 
pathlib + from datafusion.plan import LogicalPlan +from ._internal import substrait as substrait_internal + if TYPE_CHECKING: from datafusion.context import SessionContext diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index d9d994b22..c97f453d0 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -19,14 +19,15 @@ from __future__ import annotations -import datafusion._internal as df_internal -from datafusion.expr import Expr -from typing import Callable, TYPE_CHECKING, TypeVar from abc import ABCMeta, abstractmethod -from typing import List, Optional from enum import Enum +from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar + import pyarrow +import datafusion._internal as df_internal +from datafusion.expr import Expr + if TYPE_CHECKING: _R = TypeVar("_R", bound=pyarrow.DataType) diff --git a/python/tests/conftest.py b/python/tests/conftest.py index 1cc07e500..9548fbfe4 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -15,9 +15,9 @@ # specific language governing permissions and limitations # under the License. +import pyarrow as pa import pytest from datafusion import SessionContext -import pyarrow as pa from pyarrow.csv import write_csv diff --git a/python/tests/test_aggregation.py b/python/tests/test_aggregation.py index 243a8c3c9..5ef46131b 100644 --- a/python/tests/test_aggregation.py +++ b/python/tests/test_aggregation.py @@ -18,7 +18,6 @@ import numpy as np import pyarrow as pa import pytest - from datafusion import SessionContext, column, lit from datafusion import functions as f from datafusion.common import NullTreatment diff --git a/python/tests/test_config.py b/python/tests/test_config.py index 12d9fc3ff..c1d7f97e1 100644 --- a/python/tests/test_config.py +++ b/python/tests/test_config.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. 
-from datafusion import Config import pytest +from datafusion import Config @pytest.fixture diff --git a/python/tests/test_context.py b/python/tests/test_context.py index 10e8ad0e9..91046e6b8 100644 --- a/python/tests/test_context.py +++ b/python/tests/test_context.py @@ -14,15 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import datetime as dt import gzip import os -import datetime as dt import pathlib import pyarrow as pa import pyarrow.dataset as ds import pytest - from datafusion import ( DataFrame, RuntimeEnvBuilder, diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index a1a871e9a..5bc3fb094 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -18,11 +18,8 @@ from typing import Any import pyarrow as pa -from pyarrow.csv import write_csv import pyarrow.parquet as pq import pytest - -from datafusion import functions as f from datafusion import ( DataFrame, SessionContext, @@ -30,7 +27,9 @@ column, literal, ) +from datafusion import functions as f from datafusion.expr import Window +from pyarrow.csv import write_csv @pytest.fixture diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index add170c17..ad6aa7c0a 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -15,15 +15,13 @@ # specific language governing permissions and limitations # under the License. 
import math +from datetime import datetime import numpy as np import pyarrow as pa import pytest -from datetime import datetime - -from datafusion import SessionContext, column +from datafusion import SessionContext, column, literal, string_literal from datafusion import functions as f -from datafusion import literal, string_literal np.seterr(invalid="ignore") diff --git a/python/tests/test_imports.py b/python/tests/test_imports.py index 6ea77b15f..0c155cbde 100644 --- a/python/tests/test_imports.py +++ b/python/tests/test_imports.py @@ -15,72 +15,69 @@ # specific language governing permissions and limitations # under the License. -import pytest - import datafusion +import pytest from datafusion import ( AggregateUDF, DataFrame, - SessionContext, ScalarUDF, + SessionContext, functions, ) - from datafusion.common import ( DFSchema, ) - from datafusion.expr import ( - Expr, - Column, - Literal, - BinaryExpr, - AggregateFunction, - Projection, - TableScan, - Filter, - Limit, Aggregate, - Sort, - Analyze, - Join, - JoinType, - JoinConstraint, - Union, - Like, - ILike, - SimilarTo, - ScalarVariable, + AggregateFunction, Alias, - Not, - IsNotNull, - IsTrue, - IsFalse, - IsUnknown, - IsNotTrue, - IsNotFalse, - IsNotUnknown, - Negative, - InList, - Exists, - Subquery, - InSubquery, - ScalarSubquery, - GroupingSet, - Placeholder, + Analyze, + Between, + BinaryExpr, Case, Cast, - TryCast, - SubqueryAlias, - Between, - Explain, - Extension, + Column, CreateMemoryTable, CreateView, Distinct, DropTable, - Repartition, + Exists, + Explain, + Expr, + Extension, + Filter, + GroupingSet, + ILike, + InList, + InSubquery, + IsFalse, + IsNotFalse, + IsNotNull, + IsNotTrue, + IsNotUnknown, + IsTrue, + IsUnknown, + Join, + JoinConstraint, + JoinType, + Like, + Limit, + Literal, + Negative, + Not, Partitioning, + Placeholder, + Projection, + Repartition, + ScalarSubquery, + ScalarVariable, + SimilarTo, + Sort, + Subquery, + SubqueryAlias, + TableScan, + TryCast, + Union, ) diff --git 
a/python/tests/test_indexing.py b/python/tests/test_indexing.py index 8ca3eab19..5b0d08610 100644 --- a/python/tests/test_indexing.py +++ b/python/tests/test_indexing.py @@ -17,7 +17,6 @@ import pyarrow as pa import pytest - from datafusion import SessionContext diff --git a/python/tests/test_input.py b/python/tests/test_input.py index fb53d86e5..806471357 100644 --- a/python/tests/test_input.py +++ b/python/tests/test_input.py @@ -16,6 +16,7 @@ # under the License. import os + from datafusion.input.location import LocationInputPlugin diff --git a/python/tests/test_plans.py b/python/tests/test_plans.py index 0283a4e6a..396acbe97 100644 --- a/python/tests/test_plans.py +++ b/python/tests/test_plans.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. -from datafusion import SessionContext, LogicalPlan, ExecutionPlan import pytest +from datafusion import ExecutionPlan, LogicalPlan, SessionContext # Note: We must use CSV because memory tables are currently not supported for diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py index a2521dd09..862f745bf 100644 --- a/python/tests/test_sql.py +++ b/python/tests/test_sql.py @@ -19,12 +19,11 @@ import numpy as np import pyarrow as pa -from pyarrow.csv import write_csv import pyarrow.dataset as ds import pytest +from datafusion import col, udf from datafusion.object_store import Http - -from datafusion import udf, col +from pyarrow.csv import write_csv from . import generic as helpers diff --git a/python/tests/test_store.py b/python/tests/test_store.py index f85b28311..53ffc3acf 100644 --- a/python/tests/test_store.py +++ b/python/tests/test_store.py @@ -18,7 +18,6 @@ import os import pytest - from datafusion import SessionContext diff --git a/python/tests/test_substrait.py b/python/tests/test_substrait.py index 2071c8f3b..feada7cde 100644 --- a/python/tests/test_substrait.py +++ b/python/tests/test_substrait.py @@ -16,10 +16,9 @@ # under the License. 
import pyarrow as pa - +import pytest from datafusion import SessionContext from datafusion import substrait as ss -import pytest @pytest.fixture diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py index 8f31748e0..0005a3da8 100644 --- a/python/tests/test_udaf.py +++ b/python/tests/test_udaf.py @@ -20,7 +20,6 @@ import pyarrow as pa import pyarrow.compute as pc import pytest - from datafusion import Accumulator, column, udaf diff --git a/python/tests/test_udf.py b/python/tests/test_udf.py index 568a66dbb..3a5dce6d6 100644 --- a/python/tests/test_udf.py +++ b/python/tests/test_udf.py @@ -15,9 +15,9 @@ # specific language governing permissions and limitations # under the License. -from datafusion import udf, column import pyarrow as pa import pytest +from datafusion import column, udf @pytest.fixture diff --git a/python/tests/test_udwf.py b/python/tests/test_udwf.py index 2099ac9bc..0ffa04179 100644 --- a/python/tests/test_udwf.py +++ b/python/tests/test_udwf.py @@ -17,10 +17,10 @@ import pyarrow as pa import pytest - -from datafusion import SessionContext, column, udwf, lit, functions as f -from datafusion.udf import WindowEvaluator +from datafusion import SessionContext, column, lit, udwf +from datafusion import functions as f from datafusion.expr import WindowFrame +from datafusion.udf import WindowEvaluator class ExponentialSmoothDefault(WindowEvaluator): From 8b513906315a0749b9f5cd6f34bf259ab4dd1add Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 1 Feb 2025 08:29:48 -0500 Subject: [PATCH 093/248] feat: remove DataFusion pyarrow feat (#1000) * Add developer instructions to speed up build processes * Remove pyarrow dep from datafusion. Add in PyScalarValue wrapper and rename DataFusionError to PyDataFusionError to be less confusing * Removed unnecessary cloning of scalar value when going from rust to python. 
Also removed the rust unit tests copied over from upstream repo that were failing due to #941 in pyo3 * Change return types to PyDataFusionError to simplify code * Update exception handling to fix build errors in recent rust toolchains --- Cargo.lock | 145 +++++++++++------- Cargo.toml | 2 +- .../source/contributor-guide/introduction.rst | 53 +++++++ python/tests/test_indexing.py | 3 +- src/catalog.rs | 8 +- src/common/data_type.rs | 14 ++ src/config.rs | 11 +- src/context.rs | 136 ++++++++-------- src/dataframe.rs | 119 +++++++------- src/dataset_exec.rs | 6 +- src/errors.rs | 42 ++--- src/expr.rs | 38 ++--- src/expr/conditional_expr.rs | 6 +- src/expr/literal.rs | 4 +- src/expr/window.rs | 13 +- src/functions.rs | 55 ++++--- src/lib.rs | 1 + src/physical_plan.rs | 13 +- src/pyarrow_filter_expression.rs | 24 +-- src/pyarrow_util.rs | 61 ++++++++ src/record_batch.rs | 3 +- src/sql/exceptions.rs | 16 +- src/sql/logical.rs | 14 +- src/substrait.rs | 54 ++++--- src/udaf.rs | 21 ++- src/udwf.rs | 4 +- src/utils.rs | 6 +- 27 files changed, 524 insertions(+), 348 deletions(-) create mode 100644 src/pyarrow_util.rs diff --git a/Cargo.lock b/Cargo.lock index 5a74a4839..c6590fd21 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -79,7 +79,7 @@ checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", - "getrandom", + "getrandom 0.2.15", "once_cell", "version_check", "zerocopy", @@ -449,9 +449,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.85" +version = "0.1.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056" +checksum = "644dd749086bf3771a2fbc5f256fdb982d53f011c7d5d560304eafeecebce79d" dependencies = [ "proc-macro2", "quote", @@ -576,9 +576,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "4.0.1" +version = "4.0.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a45bd2e4095a8b518033b128020dd4a55aab1c0a381ba4404a472630f4bc362" +checksum = "74fa05ad7d803d413eb8380983b092cbbaf9a85f151b871360e7b00cd7060b37" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -586,9 +586,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.16.0" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" [[package]] name = "byteorder" @@ -635,9 +635,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.10" +version = "1.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13208fcbb66eaeffe09b99fffbe1af420f00a7b35aa99ad683dfc1aa76145229" +checksum = "e4730490333d58093109dc02c23174c3f4d490998c3fed3cc8e82d57afedb9cf" dependencies = [ "jobserver", "libc", @@ -692,9 +692,9 @@ dependencies = [ [[package]] name = "cmake" -version = "0.1.52" +version = "0.1.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c682c223677e0e5b6b7f63a64b9351844c3f1b1678a68b7ee617e30fb082620e" +checksum = "e24a03c8b52922d68a1589ad61032f2c1aa5a8158d2aa0d93c6e9534944bbad6" dependencies = [ "cc", ] @@ -725,7 +725,7 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom", + "getrandom 0.2.15", "once_cell", "tiny-keccak", ] @@ -784,9 +784,9 @@ checksum = "69f3b219d28b6e3b4ac87bc1fc522e0803ab22e055da177bff0068c4150c61a6" [[package]] name = "cpufeatures" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" 
dependencies = [ "libc", ] @@ -817,9 +817,9 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" [[package]] name = "crypto-common" @@ -961,7 +961,6 @@ dependencies = [ "object_store", "parquet", "paste", - "pyo3", "recursive", "sqlparser", "tokio", @@ -1411,9 +1410,9 @@ dependencies = [ [[package]] name = "dyn-clone" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" +checksum = "feeef44e73baff3a26d371801df019877a9866a8c493d315ab00177843314f35" [[package]] name = "either" @@ -1607,10 +1606,22 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "wasm-bindgen", ] +[[package]] +name = "getrandom" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.13.3+wasi-0.2.2", + "windows-targets", +] + [[package]] name = "gimli" version = "0.31.1" @@ -1722,9 +1733,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.9.5" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" +checksum = "f2d708df4e7140240a16cd6ab0ab65c972d7433ab77819ea693fde9c43811e2a" [[package]] name = "humantime" @@ -1734,9 +1745,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "1.5.2" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "256fb8d4bd6413123cc9d91832d78325c48ff41677595be797d90f42969beae0" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" dependencies = [ "bytes", "futures-channel", @@ -1953,9 +1964,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.7.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" +checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" dependencies = [ "equivalent", "hashbrown 0.15.2", @@ -1975,9 +1986,9 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "ipnet" -version = "2.10.1" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "itertools" @@ -2243,7 +2254,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.52.0", ] @@ -2377,9 +2388,9 @@ checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "openssl-probe" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "ordered-float" @@ -2661,9 +2672,9 @@ dependencies = [ [[package]] name = "protobuf-src" -version = "2.1.0+27.1" +version = "2.1.1+27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7edafa3bcc668fa93efafcbdf58d7821bbda0f4b458ac7fae3d57ec0fec8167" +checksum = 
"6217c3504da19b85a3a4b2e9a5183d635822d83507ba0986624b5c05b83bfc40" dependencies = [ "cmake", ] @@ -2794,7 +2805,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" dependencies = [ "bytes", - "getrandom", + "getrandom 0.2.15", "rand", "ring", "rustc-hash", @@ -2857,7 +2868,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.15", ] [[package]] @@ -2926,9 +2937,9 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "regress" -version = "0.10.2" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f56e622c2378013c6c61e2bd776604c46dc1087b2dc5293275a0c20a44f0771" +checksum = "78ef7fa9ed0256d64a688a3747d0fef7a88851c18a5e1d57f115f38ec2e09366" dependencies = [ "hashbrown 0.15.2", "memchr", @@ -2997,7 +3008,7 @@ checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" dependencies = [ "cc", "cfg-if", - "getrandom", + "getrandom 0.2.15", "libc", "spin", "untrusted", @@ -3033,9 +3044,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.43" +version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ "bitflags 2.8.0", "errno", @@ -3046,9 +3057,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.21" +version = "0.23.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f287924602bf649d949c63dc8ac8b235fa5387d394020705b80c4eb597ce5b8" +checksum = "9fb9263ab4eb695e42321db096e3b8fbd715a59b154d5c88d82db2175b681ba7" dependencies = [ "once_cell", "ring", @@ -3081,9 
+3092,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.10.1" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2bf47e6ff922db3825eb750c4e2ff784c6ff8fb9e13046ef6a1d1c5401b0b37" +checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" dependencies = [ "web-time", ] @@ -3107,9 +3118,9 @@ checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" [[package]] name = "ryu" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" [[package]] name = "same-file" @@ -3184,9 +3195,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cb6eb87a131f756572d7fb904f6e7b68633f09cca868c5df1c4b8d1a694bbba" +checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03" dependencies = [ "serde", ] @@ -3239,9 +3250,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.136" +version = "1.0.138" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "336a0c23cf42a38d9eaa7cd22c7040d04e1228a19a933890805ffd00a16437d2" +checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" dependencies = [ "itoa", "memchr", @@ -3514,13 +3525,13 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.15.0" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704" +checksum = "38c246215d7d24f48ae091a2902398798e05d978b24315d6efbc00ede9a8bb91" dependencies = [ "cfg-if", "fastrand", - "getrandom", + "getrandom 0.3.1", "once_cell", 
"rustix", "windows-sys 0.59.0", @@ -3831,9 +3842,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.14" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" [[package]] name = "unicode-segmentation" @@ -3890,11 +3901,11 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.12.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "744018581f9a3454a9e15beb8a33b017183f1e7c0cd170232a2d1453b23a51c4" +checksum = "b3758f5e68192bb96cc8f9b7e2c2cfdabb435499a28499a42f8f984092adad4b" dependencies = [ - "getrandom", + "getrandom 0.2.15", "serde", ] @@ -3929,6 +3940,15 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.13.3+wasi-0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "wasm-bindgen" version = "0.2.100" @@ -4185,6 +4205,15 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "wit-bindgen-rt" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +dependencies = [ + "bitflags 2.8.0", +] + [[package]] name = "write16" version = "1.0.0" diff --git a/Cargo.toml b/Cargo.toml index 10cffccb1..003ba36e5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,7 +38,7 @@ tokio = { version = 
"1.42", features = ["macros", "rt", "rt-multi-thread", "sync pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] } pyo3-async-runtimes = { version = "0.22", features = ["tokio-runtime"]} arrow = { version = "53", features = ["pyarrow"] } -datafusion = { version = "44.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } +datafusion = { version = "44.0.0", features = ["avro", "unicode_expressions"] } datafusion-substrait = { version = "44.0.0", optional = true } datafusion-proto = { version = "44.0.0" } datafusion-ffi = { version = "44.0.0" } diff --git a/docs/source/contributor-guide/introduction.rst b/docs/source/contributor-guide/introduction.rst index fb98cfd1d..25f2c21a4 100644 --- a/docs/source/contributor-guide/introduction.rst +++ b/docs/source/contributor-guide/introduction.rst @@ -95,3 +95,56 @@ To update dependencies, run .. code-block:: shell uv sync --dev --no-install-package datafusion + +Improving Build Speed +--------------------- + +The `pyo3 `_ dependency of this project contains a ``build.rs`` file which +can cause it to rebuild frequently. You can prevent this from happening by defining a ``PYO3_CONFIG_FILE`` +environment variable that points to a file with your build configuration. Whenever your build configuration +changes, such as during some major version updates, you will need to regenerate this file. This variable +should point to a fully resolved path on your build machine. + +To generate this file, use the following command: + +.. code-block:: shell + + PYO3_PRINT_CONFIG=1 cargo build + +This will generate some output that looks like the following. You will want to copy these contents intro +a file. If you place this file in your project directory with filename ``.pyo3_build_config`` it will +be ignored by ``git``. + +.. 
code-block:: + + implementation=CPython + version=3.8 + shared=true + abi3=true + lib_name=python3.12 + lib_dir=/opt/homebrew/opt/python@3.12/Frameworks/Python.framework/Versions/3.12/lib + executable=/Users/myusername/src/datafusion-python/.venv/bin/python + pointer_width=64 + build_flags= + suppress_build_script_link_lines=false + +Add the environment variable to your system. + +.. code-block:: shell + + export PYO3_CONFIG_FILE="/Users/myusername/src/datafusion-python/.pyo3_build_config" + +If you are on a Mac and you use VS Code for your IDE, you will want to add these variables +to your settings. You can find the appropriate rust flags by looking in the +``.cargo/config.toml`` file. + +.. code-block:: + + "rust-analyzer.cargo.extraEnv": { + "RUSTFLAGS": "-C link-arg=-undefined -C link-arg=dynamic_lookup", + "PYO3_CONFIG_FILE": "/Users/myusername/src/datafusion-python/.pyo3_build_config" + }, + "rust-analyzer.runnables.extraEnv": { + "RUSTFLAGS": "-C link-arg=-undefined -C link-arg=dynamic_lookup", + "PYO3_CONFIG_FILE": "/Users/myusername/src/datafusion-python/.pyo3_build_config" + } diff --git a/python/tests/test_indexing.py b/python/tests/test_indexing.py index 5b0d08610..327decd2f 100644 --- a/python/tests/test_indexing.py +++ b/python/tests/test_indexing.py @@ -43,7 +43,8 @@ def test_err(df): with pytest.raises(Exception) as e_info: df["c"] - assert "Schema error: No field named c."
in e_info.value.args[0] + for e in ["SchemaError", "FieldNotFound", 'name: "c"']: + assert e in e_info.value.args[0] with pytest.raises(Exception) as e_info: df[1] diff --git a/src/catalog.rs b/src/catalog.rs index 1ce66a4dc..1e189a5aa 100644 --- a/src/catalog.rs +++ b/src/catalog.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use pyo3::exceptions::PyKeyError; use pyo3::prelude::*; -use crate::errors::DataFusionError; +use crate::errors::{PyDataFusionError, PyDataFusionResult}; use crate::utils::wait_for_future; use datafusion::{ arrow::pyarrow::ToPyArrow, @@ -96,11 +96,13 @@ impl PyDatabase { self.database.table_names().into_iter().collect() } - fn table(&self, name: &str, py: Python) -> PyResult { + fn table(&self, name: &str, py: Python) -> PyDataFusionResult { if let Some(table) = wait_for_future(py, self.database.table(name))? { Ok(PyTable::new(table)) } else { - Err(DataFusionError::Common(format!("Table not found: {name}")).into()) + Err(PyDataFusionError::Common(format!( + "Table not found: {name}" + ))) } } diff --git a/src/common/data_type.rs b/src/common/data_type.rs index 7f9c75bfd..f5f8a6b06 100644 --- a/src/common/data_type.rs +++ b/src/common/data_type.rs @@ -23,6 +23,20 @@ use pyo3::{exceptions::PyValueError, prelude::*}; use crate::errors::py_datafusion_err; +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub struct PyScalarValue(pub ScalarValue); + +impl From for PyScalarValue { + fn from(value: ScalarValue) -> Self { + Self(value) + } +} +impl From for ScalarValue { + fn from(value: PyScalarValue) -> Self { + value.0 + } +} + #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] #[pyclass(eq, eq_int, name = "RexType", module = "datafusion.common")] pub enum RexType { diff --git a/src/config.rs b/src/config.rs index 3f2a05580..cc725b9a3 100644 --- a/src/config.rs +++ b/src/config.rs @@ -21,6 +21,8 @@ use pyo3::types::*; use datafusion::common::ScalarValue; use datafusion::config::ConfigOptions; +use crate::errors::PyDataFusionResult; 
+ #[pyclass(name = "Config", module = "datafusion", subclass)] #[derive(Clone)] pub(crate) struct PyConfig { @@ -38,7 +40,7 @@ impl PyConfig { /// Get configurations from environment variables #[staticmethod] - pub fn from_env() -> PyResult { + pub fn from_env() -> PyDataFusionResult { Ok(Self { config: ConfigOptions::from_env()?, }) @@ -56,11 +58,10 @@ impl PyConfig { } /// Set a configuration option - pub fn set(&mut self, key: &str, value: PyObject, py: Python) -> PyResult<()> { + pub fn set(&mut self, key: &str, value: PyObject, py: Python) -> PyDataFusionResult<()> { let scalar_value = py_obj_to_scalar_value(py, value); - self.config - .set(key, scalar_value.to_string().as_str()) - .map_err(|e| e.into()) + self.config.set(key, scalar_value.to_string().as_str())?; + Ok(()) } /// Get all configuration options diff --git a/src/context.rs b/src/context.rs index bab7fd42a..f53b15576 100644 --- a/src/context.rs +++ b/src/context.rs @@ -28,16 +28,17 @@ use object_store::ObjectStore; use url::Url; use uuid::Uuid; -use pyo3::exceptions::{PyKeyError, PyNotImplementedError, PyTypeError, PyValueError}; +use pyo3::exceptions::{PyKeyError, PyValueError}; use pyo3::prelude::*; use crate::catalog::{PyCatalog, PyTable}; use crate::dataframe::PyDataFrame; use crate::dataset::Dataset; -use crate::errors::{py_datafusion_err, DataFusionError}; +use crate::errors::{py_datafusion_err, PyDataFusionResult}; use crate::expr::sort_expr::PySortExpr; use crate::physical_plan::PyExecutionPlan; use crate::record_batch::PyRecordBatchStream; +use crate::sql::exceptions::py_value_err; use crate::sql::logical::PyLogicalPlan; use crate::store::StorageContexts; use crate::udaf::PyAggregateUDF; @@ -277,7 +278,7 @@ impl PySessionContext { pub fn new( config: Option, runtime: Option, - ) -> PyResult { + ) -> PyDataFusionResult { let config = if let Some(c) = config { c.config } else { @@ -348,7 +349,7 @@ impl PySessionContext { schema: Option>, file_sort_order: Option>>, py: Python, - ) -> 
PyResult<()> { + ) -> PyDataFusionResult<()> { let options = ListingOptions::new(Arc::new(ParquetFormat::new())) .with_file_extension(file_extension) .with_table_partition_cols(convert_table_partition_cols(table_partition_cols)?) @@ -365,7 +366,7 @@ impl PySessionContext { None => { let state = self.ctx.state(); let schema = options.infer_schema(&state, &table_path); - wait_for_future(py, schema).map_err(DataFusionError::from)? + wait_for_future(py, schema)? } }; let config = ListingTableConfig::new(table_path) @@ -382,9 +383,9 @@ impl PySessionContext { } /// Returns a PyDataFrame whose plan corresponds to the SQL statement. - pub fn sql(&mut self, query: &str, py: Python) -> PyResult { + pub fn sql(&mut self, query: &str, py: Python) -> PyDataFusionResult { let result = self.ctx.sql(query); - let df = wait_for_future(py, result).map_err(DataFusionError::from)?; + let df = wait_for_future(py, result)?; Ok(PyDataFrame::new(df)) } @@ -394,14 +395,14 @@ impl PySessionContext { query: &str, options: Option, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let options = if let Some(options) = options { options.options } else { SQLOptions::new() }; let result = self.ctx.sql_with_options(query, options); - let df = wait_for_future(py, result).map_err(DataFusionError::from)?; + let df = wait_for_future(py, result)?; Ok(PyDataFrame::new(df)) } @@ -412,14 +413,14 @@ impl PySessionContext { name: Option<&str>, schema: Option>, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let schema = if let Some(schema) = schema { SchemaRef::from(schema.0) } else { partitions.0[0][0].schema() }; - let table = MemTable::try_new(schema, partitions.0).map_err(DataFusionError::from)?; + let table = MemTable::try_new(schema, partitions.0)?; // generate a random (unique) name for this table if none is provided // table name cannot start with numeric digit @@ -433,11 +434,9 @@ impl PySessionContext { } }; - self.ctx - .register_table(&*table_name, Arc::new(table)) - 
.map_err(DataFusionError::from)?; + self.ctx.register_table(&*table_name, Arc::new(table))?; - let table = wait_for_future(py, self._table(&table_name)).map_err(DataFusionError::from)?; + let table = wait_for_future(py, self._table(&table_name))?; let df = PyDataFrame::new(table); Ok(df) @@ -495,15 +494,14 @@ impl PySessionContext { data: Bound<'_, PyAny>, name: Option<&str>, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let (schema, batches) = if let Ok(stream_reader) = ArrowArrayStreamReader::from_pyarrow_bound(&data) { // Works for any object that implements __arrow_c_stream__ in pycapsule. let schema = stream_reader.schema().as_ref().to_owned(); let batches = stream_reader - .collect::, arrow::error::ArrowError>>() - .map_err(DataFusionError::from)?; + .collect::, arrow::error::ArrowError>>()?; (schema, batches) } else if let Ok(array) = RecordBatch::from_pyarrow_bound(&data) { @@ -512,8 +510,8 @@ impl PySessionContext { (array.schema().as_ref().to_owned(), vec![array]) } else { - return Err(PyTypeError::new_err( - "Expected either a Arrow Array or Arrow Stream in from_arrow().", + return Err(crate::errors::PyDataFusionError::Common( + "Expected either a Arrow Array or Arrow Stream in from_arrow().".to_string(), )); }; @@ -559,17 +557,13 @@ impl PySessionContext { Ok(df) } - pub fn register_table(&mut self, name: &str, table: &PyTable) -> PyResult<()> { - self.ctx - .register_table(name, table.table()) - .map_err(DataFusionError::from)?; + pub fn register_table(&mut self, name: &str, table: &PyTable) -> PyDataFusionResult<()> { + self.ctx.register_table(name, table.table())?; Ok(()) } - pub fn deregister_table(&mut self, name: &str) -> PyResult<()> { - self.ctx - .deregister_table(name) - .map_err(DataFusionError::from)?; + pub fn deregister_table(&mut self, name: &str) -> PyDataFusionResult<()> { + self.ctx.deregister_table(name)?; Ok(()) } @@ -578,10 +572,10 @@ impl PySessionContext { &mut self, name: &str, provider: Bound<'_, PyAny>, - ) -> 
PyResult<()> { + ) -> PyDataFusionResult<()> { if provider.hasattr("__datafusion_table_provider__")? { let capsule = provider.getattr("__datafusion_table_provider__")?.call0()?; - let capsule = capsule.downcast::()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; validate_pycapsule(capsule, "datafusion_table_provider")?; let provider = unsafe { capsule.reference::() }; @@ -591,8 +585,9 @@ impl PySessionContext { Ok(()) } else { - Err(PyNotImplementedError::new_err( - "__datafusion_table_provider__ does not exist on Table Provider object.", + Err(crate::errors::PyDataFusionError::Common( + "__datafusion_table_provider__ does not exist on Table Provider object." + .to_string(), )) } } @@ -601,12 +596,10 @@ impl PySessionContext { &mut self, name: &str, partitions: PyArrowType>>, - ) -> PyResult<()> { + ) -> PyDataFusionResult<()> { let schema = partitions.0[0][0].schema(); let table = MemTable::try_new(schema, partitions.0)?; - self.ctx - .register_table(name, Arc::new(table)) - .map_err(DataFusionError::from)?; + self.ctx.register_table(name, Arc::new(table))?; Ok(()) } @@ -628,7 +621,7 @@ impl PySessionContext { schema: Option>, file_sort_order: Option>>, py: Python, - ) -> PyResult<()> { + ) -> PyDataFusionResult<()> { let mut options = ParquetReadOptions::default() .table_partition_cols(convert_table_partition_cols(table_partition_cols)?) 
.parquet_pruning(parquet_pruning) @@ -642,7 +635,7 @@ impl PySessionContext { .collect(); let result = self.ctx.register_parquet(name, path, options); - wait_for_future(py, result).map_err(DataFusionError::from)?; + wait_for_future(py, result)?; Ok(()) } @@ -666,12 +659,12 @@ impl PySessionContext { file_extension: &str, file_compression_type: Option, py: Python, - ) -> PyResult<()> { + ) -> PyDataFusionResult<()> { let delimiter = delimiter.as_bytes(); if delimiter.len() != 1 { - return Err(PyValueError::new_err( + return Err(crate::errors::PyDataFusionError::PythonError(py_value_err( "Delimiter must be a single character", - )); + ))); } let mut options = CsvReadOptions::new() @@ -685,11 +678,11 @@ impl PySessionContext { if path.is_instance_of::() { let paths = path.extract::>()?; let result = self.register_csv_from_multiple_paths(name, paths, options); - wait_for_future(py, result).map_err(DataFusionError::from)?; + wait_for_future(py, result)?; } else { let path = path.extract::()?; let result = self.ctx.register_csv(name, &path, options); - wait_for_future(py, result).map_err(DataFusionError::from)?; + wait_for_future(py, result)?; } Ok(()) @@ -713,7 +706,7 @@ impl PySessionContext { table_partition_cols: Vec<(String, String)>, file_compression_type: Option, py: Python, - ) -> PyResult<()> { + ) -> PyDataFusionResult<()> { let path = path .to_str() .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; @@ -726,7 +719,7 @@ impl PySessionContext { options.schema = schema.as_ref().map(|x| &x.0); let result = self.ctx.register_json(name, path, options); - wait_for_future(py, result).map_err(DataFusionError::from)?; + wait_for_future(py, result)?; Ok(()) } @@ -745,7 +738,7 @@ impl PySessionContext { file_extension: &str, table_partition_cols: Vec<(String, String)>, py: Python, - ) -> PyResult<()> { + ) -> PyDataFusionResult<()> { let path = path .to_str() .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; @@ -756,7 
+749,7 @@ impl PySessionContext { options.schema = schema.as_ref().map(|x| &x.0); let result = self.ctx.register_avro(name, path, options); - wait_for_future(py, result).map_err(DataFusionError::from)?; + wait_for_future(py, result)?; Ok(()) } @@ -767,12 +760,10 @@ impl PySessionContext { name: &str, dataset: &Bound<'_, PyAny>, py: Python, - ) -> PyResult<()> { + ) -> PyDataFusionResult<()> { let table: Arc = Arc::new(Dataset::new(dataset, py)?); - self.ctx - .register_table(name, table) - .map_err(DataFusionError::from)?; + self.ctx.register_table(name, table)?; Ok(()) } @@ -824,11 +815,11 @@ impl PySessionContext { Ok(PyDataFrame::new(x)) } - pub fn table_exist(&self, name: &str) -> PyResult { + pub fn table_exist(&self, name: &str) -> PyDataFusionResult { Ok(self.ctx.table_exist(name)?) } - pub fn empty_table(&self) -> PyResult { + pub fn empty_table(&self) -> PyDataFusionResult { Ok(PyDataFrame::new(self.ctx.read_empty()?)) } @@ -847,7 +838,7 @@ impl PySessionContext { table_partition_cols: Vec<(String, String)>, file_compression_type: Option, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let path = path .to_str() .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; @@ -859,10 +850,10 @@ impl PySessionContext { let df = if let Some(schema) = schema { options.schema = Some(&schema.0); let result = self.ctx.read_json(path, options); - wait_for_future(py, result).map_err(DataFusionError::from)? + wait_for_future(py, result)? } else { let result = self.ctx.read_json(path, options); - wait_for_future(py, result).map_err(DataFusionError::from)? + wait_for_future(py, result)? 
}; Ok(PyDataFrame::new(df)) } @@ -888,12 +879,12 @@ impl PySessionContext { table_partition_cols: Vec<(String, String)>, file_compression_type: Option, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let delimiter = delimiter.as_bytes(); if delimiter.len() != 1 { - return Err(PyValueError::new_err( + return Err(crate::errors::PyDataFusionError::PythonError(py_value_err( "Delimiter must be a single character", - )); + ))); }; let mut options = CsvReadOptions::new() @@ -909,12 +900,12 @@ impl PySessionContext { let paths = path.extract::>()?; let paths = paths.iter().map(|p| p as &str).collect::>(); let result = self.ctx.read_csv(paths, options); - let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?); + let df = PyDataFrame::new(wait_for_future(py, result)?); Ok(df) } else { let path = path.extract::()?; let result = self.ctx.read_csv(path, options); - let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?); + let df = PyDataFrame::new(wait_for_future(py, result)?); Ok(df) } } @@ -938,7 +929,7 @@ impl PySessionContext { schema: Option>, file_sort_order: Option>>, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let mut options = ParquetReadOptions::default() .table_partition_cols(convert_table_partition_cols(table_partition_cols)?) 
.parquet_pruning(parquet_pruning) @@ -952,7 +943,7 @@ impl PySessionContext { .collect(); let result = self.ctx.read_parquet(path, options); - let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?); + let df = PyDataFrame::new(wait_for_future(py, result)?); Ok(df) } @@ -965,26 +956,23 @@ impl PySessionContext { table_partition_cols: Vec<(String, String)>, file_extension: &str, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let mut options = AvroReadOptions::default() .table_partition_cols(convert_table_partition_cols(table_partition_cols)?); options.file_extension = file_extension; let df = if let Some(schema) = schema { options.schema = Some(&schema.0); let read_future = self.ctx.read_avro(path, options); - wait_for_future(py, read_future).map_err(DataFusionError::from)? + wait_for_future(py, read_future)? } else { let read_future = self.ctx.read_avro(path, options); - wait_for_future(py, read_future).map_err(DataFusionError::from)? + wait_for_future(py, read_future)? 
}; Ok(PyDataFrame::new(df)) } - pub fn read_table(&self, table: &PyTable) -> PyResult { - let df = self - .ctx - .read_table(table.table()) - .map_err(DataFusionError::from)?; + pub fn read_table(&self, table: &PyTable) -> PyDataFusionResult { + let df = self.ctx.read_table(table.table())?; Ok(PyDataFrame::new(df)) } @@ -1011,7 +999,7 @@ impl PySessionContext { plan: PyExecutionPlan, part: usize, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let ctx: TaskContext = TaskContext::from(&self.ctx.state()); // create a Tokio runtime to run the async code let rt = &get_tokio_runtime().0; @@ -1071,13 +1059,13 @@ impl PySessionContext { pub fn convert_table_partition_cols( table_partition_cols: Vec<(String, String)>, -) -> Result, DataFusionError> { +) -> PyDataFusionResult> { table_partition_cols .into_iter() .map(|(name, ty)| match ty.as_str() { "string" => Ok((name, DataType::Utf8)), "int" => Ok((name, DataType::Int32)), - _ => Err(DataFusionError::Common(format!( + _ => Err(crate::errors::PyDataFusionError::Common(format!( "Unsupported data type '{ty}' for partition column. 
Supported types are 'string' and 'int'" ))), }) diff --git a/src/dataframe.rs b/src/dataframe.rs index b875480a7..6fb08ba25 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -33,20 +33,20 @@ use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::execution::SendableRecordBatchStream; use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; use datafusion::prelude::*; -use pyo3::exceptions::{PyTypeError, PyValueError}; +use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods}; use tokio::task::JoinHandle; -use crate::errors::py_datafusion_err; +use crate::errors::{py_datafusion_err, PyDataFusionError}; use crate::expr::sort_expr::to_sort_expressions; use crate::physical_plan::PyExecutionPlan; use crate::record_batch::PyRecordBatchStream; use crate::sql::logical::PyLogicalPlan; use crate::utils::{get_tokio_runtime, validate_pycapsule, wait_for_future}; use crate::{ - errors::DataFusionError, + errors::PyDataFusionResult, expr::{sort_expr::PySortExpr, PyExpr}, }; @@ -69,7 +69,7 @@ impl PyDataFrame { #[pymethods] impl PyDataFrame { /// Enable selection for `df[col]`, `df[col1, col2, col3]`, and `df[[col1, col2, col3]]` - fn __getitem__(&self, key: Bound<'_, PyAny>) -> PyResult { + fn __getitem__(&self, key: Bound<'_, PyAny>) -> PyDataFusionResult { if let Ok(key) = key.extract::() { // df[col] self.select_columns(vec![key]) @@ -84,12 +84,12 @@ impl PyDataFrame { // df[[col1, col2, col3]] self.select_columns(keys) } else { - let message = "DataFrame can only be indexed by string index or indices"; - Err(PyTypeError::new_err(message)) + let message = "DataFrame can only be indexed by string index or indices".to_string(); + Err(PyDataFusionError::Common(message)) } } - fn __repr__(&self, py: Python) -> PyResult { + fn __repr__(&self, py: Python) -> PyDataFusionResult { let df = self.df.as_ref().clone().limit(0, Some(10))?; let 
batches = wait_for_future(py, df.collect())?; let batches_as_string = pretty::pretty_format_batches(&batches); @@ -99,7 +99,7 @@ impl PyDataFrame { } } - fn _repr_html_(&self, py: Python) -> PyResult { + fn _repr_html_(&self, py: Python) -> PyDataFusionResult { let mut html_str = "\n".to_string(); let df = self.df.as_ref().clone().limit(0, Some(10))?; @@ -145,7 +145,7 @@ impl PyDataFrame { } /// Calculate summary statistics for a DataFrame - fn describe(&self, py: Python) -> PyResult { + fn describe(&self, py: Python) -> PyDataFusionResult { let df = self.df.as_ref().clone(); let stat_df = wait_for_future(py, df.describe())?; Ok(Self::new(stat_df)) @@ -157,37 +157,37 @@ impl PyDataFrame { } #[pyo3(signature = (*args))] - fn select_columns(&self, args: Vec) -> PyResult { + fn select_columns(&self, args: Vec) -> PyDataFusionResult { let args = args.iter().map(|s| s.as_ref()).collect::>(); let df = self.df.as_ref().clone().select_columns(&args)?; Ok(Self::new(df)) } #[pyo3(signature = (*args))] - fn select(&self, args: Vec) -> PyResult { + fn select(&self, args: Vec) -> PyDataFusionResult { let expr = args.into_iter().map(|e| e.into()).collect(); let df = self.df.as_ref().clone().select(expr)?; Ok(Self::new(df)) } #[pyo3(signature = (*args))] - fn drop(&self, args: Vec) -> PyResult { + fn drop(&self, args: Vec) -> PyDataFusionResult { let cols = args.iter().map(|s| s.as_ref()).collect::>(); let df = self.df.as_ref().clone().drop_columns(&cols)?; Ok(Self::new(df)) } - fn filter(&self, predicate: PyExpr) -> PyResult { + fn filter(&self, predicate: PyExpr) -> PyDataFusionResult { let df = self.df.as_ref().clone().filter(predicate.into())?; Ok(Self::new(df)) } - fn with_column(&self, name: &str, expr: PyExpr) -> PyResult { + fn with_column(&self, name: &str, expr: PyExpr) -> PyDataFusionResult { let df = self.df.as_ref().clone().with_column(name, expr.into())?; Ok(Self::new(df)) } - fn with_columns(&self, exprs: Vec) -> PyResult { + fn with_columns(&self, exprs: Vec) -> 
PyDataFusionResult { let mut df = self.df.as_ref().clone(); for expr in exprs { let expr: Expr = expr.into(); @@ -199,7 +199,7 @@ impl PyDataFrame { /// Rename one column by applying a new projection. This is a no-op if the column to be /// renamed does not exist. - fn with_column_renamed(&self, old_name: &str, new_name: &str) -> PyResult { + fn with_column_renamed(&self, old_name: &str, new_name: &str) -> PyDataFusionResult { let df = self .df .as_ref() @@ -208,7 +208,7 @@ impl PyDataFrame { Ok(Self::new(df)) } - fn aggregate(&self, group_by: Vec, aggs: Vec) -> PyResult { + fn aggregate(&self, group_by: Vec, aggs: Vec) -> PyDataFusionResult { let group_by = group_by.into_iter().map(|e| e.into()).collect(); let aggs = aggs.into_iter().map(|e| e.into()).collect(); let df = self.df.as_ref().clone().aggregate(group_by, aggs)?; @@ -216,14 +216,14 @@ impl PyDataFrame { } #[pyo3(signature = (*exprs))] - fn sort(&self, exprs: Vec) -> PyResult { + fn sort(&self, exprs: Vec) -> PyDataFusionResult { let exprs = to_sort_expressions(exprs); let df = self.df.as_ref().clone().sort(exprs)?; Ok(Self::new(df)) } #[pyo3(signature = (count, offset=0))] - fn limit(&self, count: usize, offset: usize) -> PyResult { + fn limit(&self, count: usize, offset: usize) -> PyDataFusionResult { let df = self.df.as_ref().clone().limit(offset, Some(count))?; Ok(Self::new(df)) } @@ -232,14 +232,15 @@ impl PyDataFrame { /// Unless some order is specified in the plan, there is no /// guarantee of the order of the result. fn collect(&self, py: Python) -> PyResult> { - let batches = wait_for_future(py, self.df.as_ref().clone().collect())?; + let batches = wait_for_future(py, self.df.as_ref().clone().collect()) + .map_err(PyDataFusionError::from)?; // cannot use PyResult> return type due to // https://github.com/PyO3/pyo3/issues/1813 batches.into_iter().map(|rb| rb.to_pyarrow(py)).collect() } /// Cache DataFrame. 
- fn cache(&self, py: Python) -> PyResult { + fn cache(&self, py: Python) -> PyDataFusionResult { let df = wait_for_future(py, self.df.as_ref().clone().cache())?; Ok(Self::new(df)) } @@ -247,7 +248,8 @@ impl PyDataFrame { /// Executes this DataFrame and collects all results into a vector of vector of RecordBatch /// maintaining the input partitioning. fn collect_partitioned(&self, py: Python) -> PyResult>> { - let batches = wait_for_future(py, self.df.as_ref().clone().collect_partitioned())?; + let batches = wait_for_future(py, self.df.as_ref().clone().collect_partitioned()) + .map_err(PyDataFusionError::from)?; batches .into_iter() @@ -257,13 +259,13 @@ impl PyDataFrame { /// Print the result, 20 lines by default #[pyo3(signature = (num=20))] - fn show(&self, py: Python, num: usize) -> PyResult<()> { + fn show(&self, py: Python, num: usize) -> PyDataFusionResult<()> { let df = self.df.as_ref().clone().limit(0, Some(num))?; print_dataframe(py, df) } /// Filter out duplicate rows - fn distinct(&self) -> PyResult { + fn distinct(&self) -> PyDataFusionResult { let df = self.df.as_ref().clone().distinct()?; Ok(Self::new(df)) } @@ -274,7 +276,7 @@ impl PyDataFrame { how: &str, left_on: Vec, right_on: Vec, - ) -> PyResult { + ) -> PyDataFusionResult { let join_type = match how { "inner" => JoinType::Inner, "left" => JoinType::Left, @@ -283,10 +285,9 @@ impl PyDataFrame { "semi" => JoinType::LeftSemi, "anti" => JoinType::LeftAnti, how => { - return Err(DataFusionError::Common(format!( + return Err(PyDataFusionError::Common(format!( "The join type {how} does not exist or is not implemented" - )) - .into()); + ))); } }; @@ -303,7 +304,12 @@ impl PyDataFrame { Ok(Self::new(df)) } - fn join_on(&self, right: PyDataFrame, on_exprs: Vec, how: &str) -> PyResult { + fn join_on( + &self, + right: PyDataFrame, + on_exprs: Vec, + how: &str, + ) -> PyDataFusionResult { let join_type = match how { "inner" => JoinType::Inner, "left" => JoinType::Left, @@ -312,10 +318,9 @@ impl 
PyDataFrame { "semi" => JoinType::LeftSemi, "anti" => JoinType::LeftAnti, how => { - return Err(DataFusionError::Common(format!( + return Err(PyDataFusionError::Common(format!( "The join type {how} does not exist or is not implemented" - )) - .into()); + ))); } }; let exprs: Vec = on_exprs.into_iter().map(|e| e.into()).collect(); @@ -330,7 +335,7 @@ impl PyDataFrame { /// Print the query plan #[pyo3(signature = (verbose=false, analyze=false))] - fn explain(&self, py: Python, verbose: bool, analyze: bool) -> PyResult<()> { + fn explain(&self, py: Python, verbose: bool, analyze: bool) -> PyDataFusionResult<()> { let df = self.df.as_ref().clone().explain(verbose, analyze)?; print_dataframe(py, df) } @@ -341,18 +346,18 @@ impl PyDataFrame { } /// Get the optimized logical plan for this `DataFrame` - fn optimized_logical_plan(&self) -> PyResult { + fn optimized_logical_plan(&self) -> PyDataFusionResult { Ok(self.df.as_ref().clone().into_optimized_plan()?.into()) } /// Get the execution plan for this `DataFrame` - fn execution_plan(&self, py: Python) -> PyResult { + fn execution_plan(&self, py: Python) -> PyDataFusionResult { let plan = wait_for_future(py, self.df.as_ref().clone().create_physical_plan())?; Ok(plan.into()) } /// Repartition a `DataFrame` based on a logical partitioning scheme. - fn repartition(&self, num: usize) -> PyResult { + fn repartition(&self, num: usize) -> PyDataFusionResult { let new_df = self .df .as_ref() @@ -363,7 +368,7 @@ impl PyDataFrame { /// Repartition a `DataFrame` based on a logical partitioning scheme. 
#[pyo3(signature = (*args, num))] - fn repartition_by_hash(&self, args: Vec, num: usize) -> PyResult { + fn repartition_by_hash(&self, args: Vec, num: usize) -> PyDataFusionResult { let expr = args.into_iter().map(|py_expr| py_expr.into()).collect(); let new_df = self .df @@ -376,7 +381,7 @@ impl PyDataFrame { /// Calculate the union of two `DataFrame`s, preserving duplicate rows.The /// two `DataFrame`s must have exactly the same schema #[pyo3(signature = (py_df, distinct=false))] - fn union(&self, py_df: PyDataFrame, distinct: bool) -> PyResult { + fn union(&self, py_df: PyDataFrame, distinct: bool) -> PyDataFusionResult { let new_df = if distinct { self.df .as_ref() @@ -391,7 +396,7 @@ impl PyDataFrame { /// Calculate the distinct union of two `DataFrame`s. The /// two `DataFrame`s must have exactly the same schema - fn union_distinct(&self, py_df: PyDataFrame) -> PyResult { + fn union_distinct(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self .df .as_ref() @@ -401,7 +406,7 @@ impl PyDataFrame { } #[pyo3(signature = (column, preserve_nulls=true))] - fn unnest_column(&self, column: &str, preserve_nulls: bool) -> PyResult { + fn unnest_column(&self, column: &str, preserve_nulls: bool) -> PyDataFusionResult { // TODO: expose RecursionUnnestOptions // REF: https://github.com/apache/datafusion/pull/11577 let unnest_options = UnnestOptions::default().with_preserve_nulls(preserve_nulls); @@ -414,7 +419,11 @@ impl PyDataFrame { } #[pyo3(signature = (columns, preserve_nulls=true))] - fn unnest_columns(&self, columns: Vec, preserve_nulls: bool) -> PyResult { + fn unnest_columns( + &self, + columns: Vec, + preserve_nulls: bool, + ) -> PyDataFusionResult { // TODO: expose RecursionUnnestOptions // REF: https://github.com/apache/datafusion/pull/11577 let unnest_options = UnnestOptions::default().with_preserve_nulls(preserve_nulls); @@ -428,7 +437,7 @@ impl PyDataFrame { } /// Calculate the intersection of two `DataFrame`s. 
The two `DataFrame`s must have exactly the same schema - fn intersect(&self, py_df: PyDataFrame) -> PyResult { + fn intersect(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self .df .as_ref() @@ -438,13 +447,13 @@ impl PyDataFrame { } /// Calculate the exception of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema - fn except_all(&self, py_df: PyDataFrame) -> PyResult { + fn except_all(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?; Ok(Self::new(new_df)) } /// Write a `DataFrame` to a CSV file. - fn write_csv(&self, path: &str, with_header: bool, py: Python) -> PyResult<()> { + fn write_csv(&self, path: &str, with_header: bool, py: Python) -> PyDataFusionResult<()> { let csv_options = CsvOptions { has_header: Some(with_header), ..Default::default() @@ -472,7 +481,7 @@ impl PyDataFrame { compression: &str, compression_level: Option, py: Python, - ) -> PyResult<()> { + ) -> PyDataFusionResult<()> { fn verify_compression_level(cl: Option) -> Result { cl.ok_or(PyValueError::new_err("compression_level is not defined")) } @@ -496,7 +505,7 @@ impl PyDataFrame { "lz4_raw" => Compression::LZ4_RAW, "uncompressed" => Compression::UNCOMPRESSED, _ => { - return Err(PyValueError::new_err(format!( + return Err(PyDataFusionError::Common(format!( "Unrecognized compression type {compression}" ))); } @@ -522,7 +531,7 @@ impl PyDataFrame { } /// Executes a query and writes the results to a partitioned JSON file. 
- fn write_json(&self, path: &str, py: Python) -> PyResult<()> { + fn write_json(&self, path: &str, py: Python) -> PyDataFusionResult<()> { wait_for_future( py, self.df @@ -551,7 +560,7 @@ impl PyDataFrame { &'py mut self, py: Python<'py>, requested_schema: Option>, - ) -> PyResult> { + ) -> PyDataFusionResult> { let mut batches = wait_for_future(py, self.df.as_ref().clone().collect())?; let mut schema: Schema = self.df.schema().to_owned().into(); @@ -559,15 +568,14 @@ impl PyDataFrame { validate_pycapsule(&schema_capsule, "arrow_schema")?; let schema_ptr = unsafe { schema_capsule.reference::() }; - let desired_schema = Schema::try_from(schema_ptr).map_err(DataFusionError::from)?; + let desired_schema = Schema::try_from(schema_ptr)?; - schema = project_schema(schema, desired_schema).map_err(DataFusionError::ArrowError)?; + schema = project_schema(schema, desired_schema)?; batches = batches .into_iter() .map(|record_batch| record_batch_into_schema(record_batch, &schema)) - .collect::, ArrowError>>() - .map_err(DataFusionError::ArrowError)?; + .collect::, ArrowError>>()?; } let batches_wrapped = batches.into_iter().map(Ok); @@ -578,9 +586,10 @@ impl PyDataFrame { let ffi_stream = FFI_ArrowArrayStream::new(reader); let stream_capsule_name = CString::new("arrow_array_stream").unwrap(); PyCapsule::new_bound(py, ffi_stream, Some(stream_capsule_name)) + .map_err(PyDataFusionError::from) } - fn execute_stream(&self, py: Python) -> PyResult { + fn execute_stream(&self, py: Python) -> PyDataFusionResult { // create a Tokio runtime to run the async code let rt = &get_tokio_runtime().0; let df = self.df.as_ref().clone(); @@ -647,13 +656,13 @@ impl PyDataFrame { } // Executes this DataFrame to get the total number of rows. - fn count(&self, py: Python) -> PyResult { + fn count(&self, py: Python) -> PyDataFusionResult { Ok(wait_for_future(py, self.df.as_ref().clone().count())?) 
} } /// Print DataFrame -fn print_dataframe(py: Python, df: DataFrame) -> PyResult<()> { +fn print_dataframe(py: Python, df: DataFrame) -> PyDataFusionResult<()> { // Get string representation of record batches let batches = wait_for_future(py, df.collect())?; let batches_as_string = pretty::pretty_format_batches(&batches); diff --git a/src/dataset_exec.rs b/src/dataset_exec.rs index 9d2559429..ace42115b 100644 --- a/src/dataset_exec.rs +++ b/src/dataset_exec.rs @@ -42,7 +42,7 @@ use datafusion::physical_plan::{ SendableRecordBatchStream, Statistics, }; -use crate::errors::DataFusionError; +use crate::errors::PyDataFusionResult; use crate::pyarrow_filter_expression::PyArrowFilterExpression; struct PyArrowBatchesAdapter { @@ -83,8 +83,8 @@ impl DatasetExec { dataset: &Bound<'_, PyAny>, projection: Option>, filters: &[Expr], - ) -> Result { - let columns: Option, DataFusionError>> = projection.map(|p| { + ) -> PyDataFusionResult { + let columns: Option>> = projection.map(|p| { p.iter() .map(|index| { let name: String = dataset diff --git a/src/errors.rs b/src/errors.rs index d12b6ade1..b02b754a2 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -24,10 +24,10 @@ use datafusion::error::DataFusionError as InnerDataFusionError; use prost::EncodeError; use pyo3::{exceptions::PyException, PyErr}; -pub type Result = std::result::Result; +pub type PyDataFusionResult = std::result::Result; #[derive(Debug)] -pub enum DataFusionError { +pub enum PyDataFusionError { ExecutionError(InnerDataFusionError), ArrowError(ArrowError), Common(String), @@ -35,46 +35,46 @@ pub enum DataFusionError { EncodeError(EncodeError), } -impl fmt::Display for DataFusionError { +impl fmt::Display for PyDataFusionError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - DataFusionError::ExecutionError(e) => write!(f, "DataFusion error: {e:?}"), - DataFusionError::ArrowError(e) => write!(f, "Arrow error: {e:?}"), - DataFusionError::PythonError(e) => write!(f, "Python error {e:?}"), 
- DataFusionError::Common(e) => write!(f, "{e}"), - DataFusionError::EncodeError(e) => write!(f, "Failed to encode substrait plan: {e}"), + PyDataFusionError::ExecutionError(e) => write!(f, "DataFusion error: {e:?}"), + PyDataFusionError::ArrowError(e) => write!(f, "Arrow error: {e:?}"), + PyDataFusionError::PythonError(e) => write!(f, "Python error {e:?}"), + PyDataFusionError::Common(e) => write!(f, "{e}"), + PyDataFusionError::EncodeError(e) => write!(f, "Failed to encode substrait plan: {e}"), } } } -impl From for DataFusionError { - fn from(err: ArrowError) -> DataFusionError { - DataFusionError::ArrowError(err) +impl From for PyDataFusionError { + fn from(err: ArrowError) -> PyDataFusionError { + PyDataFusionError::ArrowError(err) } } -impl From for DataFusionError { - fn from(err: InnerDataFusionError) -> DataFusionError { - DataFusionError::ExecutionError(err) +impl From for PyDataFusionError { + fn from(err: InnerDataFusionError) -> PyDataFusionError { + PyDataFusionError::ExecutionError(err) } } -impl From for DataFusionError { - fn from(err: PyErr) -> DataFusionError { - DataFusionError::PythonError(err) +impl From for PyDataFusionError { + fn from(err: PyErr) -> PyDataFusionError { + PyDataFusionError::PythonError(err) } } -impl From for PyErr { - fn from(err: DataFusionError) -> PyErr { +impl From for PyErr { + fn from(err: PyDataFusionError) -> PyErr { match err { - DataFusionError::PythonError(py_err) => py_err, + PyDataFusionError::PythonError(py_err) => py_err, _ => PyException::new_err(err.to_string()), } } } -impl Error for DataFusionError {} +impl Error for PyDataFusionError {} pub fn py_type_err(e: impl Debug) -> PyErr { PyErr::new::(format!("{e:?}")) diff --git a/src/expr.rs b/src/expr.rs index bca0cd3fa..1e9983d42 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -24,7 +24,6 @@ use std::convert::{From, Into}; use std::sync::Arc; use window::PyWindowFrame; -use arrow::pyarrow::ToPyArrow; use datafusion::arrow::datatypes::{DataType, Field}; use 
datafusion::arrow::pyarrow::PyArrowType; use datafusion::functions::core::expr_ext::FieldAccessor; @@ -33,15 +32,17 @@ use datafusion::logical_expr::{ expr::{AggregateFunction, InList, InSubquery, ScalarFunction, WindowFunction}, lit, Between, BinaryExpr, Case, Cast, Expr, Like, Operator, TryCast, }; -use datafusion::scalar::ScalarValue; -use crate::common::data_type::{DataTypeMap, NullTreatment, RexType}; -use crate::errors::{py_runtime_err, py_type_err, py_unsupported_variant_err, DataFusionError}; +use crate::common::data_type::{DataTypeMap, NullTreatment, PyScalarValue, RexType}; +use crate::errors::{ + py_runtime_err, py_type_err, py_unsupported_variant_err, PyDataFusionError, PyDataFusionResult, +}; use crate::expr::aggregate_expr::PyAggregateFunction; use crate::expr::binary_expr::PyBinaryExpr; use crate::expr::column::PyColumn; use crate::expr::literal::PyLiteral; use crate::functions::add_builder_fns_to_window; +use crate::pyarrow_util::scalar_to_pyarrow; use crate::sql::logical::PyLogicalPlan; use self::alias::PyAlias; @@ -261,8 +262,8 @@ impl PyExpr { } #[staticmethod] - pub fn literal(value: ScalarValue) -> PyExpr { - lit(value).into() + pub fn literal(value: PyScalarValue) -> PyExpr { + lit(value.0).into() } #[staticmethod] @@ -356,7 +357,7 @@ impl PyExpr { /// Extracts the Expr value into a PyObject that can be shared with Python pub fn python_value(&self, py: Python) -> PyResult { match &self.expr { - Expr::Literal(scalar_value) => Ok(scalar_value.to_pyarrow(py)?), + Expr::Literal(scalar_value) => scalar_to_pyarrow(scalar_value, py), _ => Err(py_type_err(format!( "Non Expr::Literal encountered in types: {:?}", &self.expr @@ -568,7 +569,7 @@ impl PyExpr { window_frame: Option, order_by: Option>, null_treatment: Option, - ) -> PyResult { + ) -> PyDataFusionResult { match &self.expr { Expr::AggregateFunction(agg_fn) => { let window_fn = Expr::WindowFunction(WindowFunction::new( @@ -592,10 +593,9 @@ impl PyExpr { null_treatment, ), _ => Err( - 
DataFusionError::ExecutionError(datafusion::error::DataFusionError::Plan( + PyDataFusionError::ExecutionError(datafusion::error::DataFusionError::Plan( format!("Using {} with `over` is not allowed. Must use an aggregate or window function.", self.expr.variant_name()), )) - .into(), ), } } @@ -649,34 +649,26 @@ impl PyExprFuncBuilder { .into() } - pub fn build(&self) -> PyResult { - self.builder - .clone() - .build() - .map(|expr| expr.into()) - .map_err(|err| err.into()) + pub fn build(&self) -> PyDataFusionResult { + Ok(self.builder.clone().build().map(|expr| expr.into())?) } } impl PyExpr { - pub fn _column_name(&self, plan: &LogicalPlan) -> Result { + pub fn _column_name(&self, plan: &LogicalPlan) -> PyDataFusionResult { let field = Self::expr_to_field(&self.expr, plan)?; Ok(field.name().to_owned()) } /// Create a [Field] representing an [Expr], given an input [LogicalPlan] to resolve against - pub fn expr_to_field( - expr: &Expr, - input_plan: &LogicalPlan, - ) -> Result, DataFusionError> { + pub fn expr_to_field(expr: &Expr, input_plan: &LogicalPlan) -> PyDataFusionResult> { match expr { Expr::Wildcard { .. } => { // Since * could be any of the valid column names just return the first one Ok(Arc::new(input_plan.schema().field(0).clone())) } _ => { - let fields = - exprlist_to_fields(&[expr.clone()], input_plan).map_err(PyErr::from)?; + let fields = exprlist_to_fields(&[expr.clone()], input_plan)?; Ok(fields[0].1.clone()) } } diff --git a/src/expr/conditional_expr.rs b/src/expr/conditional_expr.rs index a8a885c54..fe3af2e25 100644 --- a/src/expr/conditional_expr.rs +++ b/src/expr/conditional_expr.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::expr::PyExpr; +use crate::{errors::PyDataFusionResult, expr::PyExpr}; use datafusion::logical_expr::conditional_expressions::CaseBuilder; use pyo3::prelude::*; @@ -44,11 +44,11 @@ impl PyCaseBuilder { } } - fn otherwise(&mut self, else_expr: PyExpr) -> PyResult { + fn otherwise(&mut self, else_expr: PyExpr) -> PyDataFusionResult { Ok(self.case_builder.otherwise(else_expr.expr)?.clone().into()) } - fn end(&mut self) -> PyResult { + fn end(&mut self) -> PyDataFusionResult { Ok(self.case_builder.end()?.clone().into()) } } diff --git a/src/expr/literal.rs b/src/expr/literal.rs index 43084ba4b..2cb2079f1 100644 --- a/src/expr/literal.rs +++ b/src/expr/literal.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::errors::DataFusionError; +use crate::errors::PyDataFusionError; use datafusion::common::ScalarValue; use pyo3::prelude::*; @@ -154,5 +154,5 @@ impl PyLiteral { } fn unexpected_literal_value(value: &ScalarValue) -> PyErr { - DataFusionError::Common(format!("getValue() - Unexpected value: {value}")).into() + PyDataFusionError::Common(format!("getValue() - Unexpected value: {value}")).into() } diff --git a/src/expr/window.rs b/src/expr/window.rs index 6486dbb32..4dc6cb9c9 100644 --- a/src/expr/window.rs +++ b/src/expr/window.rs @@ -21,8 +21,9 @@ use datafusion::logical_expr::{Expr, Window, WindowFrame, WindowFrameBound, Wind use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; +use crate::common::data_type::PyScalarValue; use crate::common::df_schema::PyDFSchema; -use crate::errors::py_type_err; +use crate::errors::{py_type_err, PyDataFusionResult}; use crate::expr::logical_node::LogicalNode; use crate::expr::sort_expr::{py_sort_expr_list, PySortExpr}; use crate::expr::PyExpr; @@ -171,8 +172,8 @@ impl PyWindowFrame { #[pyo3(signature=(unit, start_bound, end_bound))] pub fn new( unit: &str, - start_bound: Option, - end_bound: Option, + start_bound: Option, + end_bound: Option, ) -> 
PyResult { let units = unit.to_ascii_lowercase(); let units = match units.as_str() { @@ -187,7 +188,7 @@ impl PyWindowFrame { } }; let start_bound = match start_bound { - Some(start_bound) => WindowFrameBound::Preceding(start_bound), + Some(start_bound) => WindowFrameBound::Preceding(start_bound.0), None => match units { WindowFrameUnits::Range => WindowFrameBound::Preceding(ScalarValue::UInt64(None)), WindowFrameUnits::Rows => WindowFrameBound::Preceding(ScalarValue::UInt64(None)), @@ -200,7 +201,7 @@ impl PyWindowFrame { }, }; let end_bound = match end_bound { - Some(end_bound) => WindowFrameBound::Following(end_bound), + Some(end_bound) => WindowFrameBound::Following(end_bound.0), None => match units { WindowFrameUnits::Rows => WindowFrameBound::Following(ScalarValue::UInt64(None)), WindowFrameUnits::Range => WindowFrameBound::Following(ScalarValue::UInt64(None)), @@ -253,7 +254,7 @@ impl PyWindowFrameBound { matches!(self.frame_bound, WindowFrameBound::Following(_)) } /// Returns the offset of the window frame - pub fn get_offset(&self) -> PyResult> { + pub fn get_offset(&self) -> PyDataFusionResult> { match &self.frame_bound { WindowFrameBound::Preceding(val) | WindowFrameBound::Following(val) => match val { x if x.is_null() => Ok(None), diff --git a/src/functions.rs b/src/functions.rs index ae032d702..46c748cf8 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -22,8 +22,10 @@ use datafusion::logical_expr::WindowFrame; use pyo3::{prelude::*, wrap_pyfunction}; use crate::common::data_type::NullTreatment; +use crate::common::data_type::PyScalarValue; use crate::context::PySessionContext; -use crate::errors::DataFusionError; +use crate::errors::PyDataFusionError; +use crate::errors::PyDataFusionResult; use crate::expr::conditional_expr::PyCaseBuilder; use crate::expr::sort_expr::to_sort_expressions; use crate::expr::sort_expr::PySortExpr; @@ -44,7 +46,7 @@ fn add_builder_fns_to_aggregate( filter: Option, order_by: Option>, null_treatment: Option, -) -> 
PyResult { +) -> PyDataFusionResult { // Since ExprFuncBuilder::new() is private, we can guarantee initializing // a builder with an `null_treatment` with option None let mut builder = agg_fn.null_treatment(None); @@ -228,7 +230,10 @@ fn when(when: PyExpr, then: PyExpr) -> PyResult { /// 1) If no function has been found, search default aggregate functions. /// /// NOTE: we search the built-ins first because the `UDAF` versions currently do not have the same behavior. -fn find_window_fn(name: &str, ctx: Option) -> PyResult { +fn find_window_fn( + name: &str, + ctx: Option, +) -> PyDataFusionResult { if let Some(ctx) = ctx { // search UDAFs let udaf = ctx @@ -284,7 +289,9 @@ fn find_window_fn(name: &str, ctx: Option) -> PyResult, order_by: Option>, null_treatment: Option - ) -> PyResult { + ) -> PyDataFusionResult { let agg_fn = functions_aggregate::expr_fn::$NAME($($arg.into()),*); add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) @@ -362,7 +369,7 @@ macro_rules! 
aggregate_function_vec_args { filter: Option, order_by: Option>, null_treatment: Option - ) -> PyResult { + ) -> PyDataFusionResult { let agg_fn = functions_aggregate::expr_fn::$NAME(vec![$($arg.into()),*]); add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) @@ -642,7 +649,7 @@ pub fn approx_percentile_cont( percentile: f64, num_centroids: Option, // enforces optional arguments at the end, currently filter: Option, -) -> PyResult { +) -> PyDataFusionResult { let args = if let Some(num_centroids) = num_centroids { vec![expression.expr, lit(percentile), lit(num_centroids)] } else { @@ -661,7 +668,7 @@ pub fn approx_percentile_cont_with_weight( weight: PyExpr, percentile: f64, filter: Option, -) -> PyResult { +) -> PyDataFusionResult { let agg_fn = functions_aggregate::expr_fn::approx_percentile_cont_with_weight( expression.expr, weight.expr, @@ -683,7 +690,7 @@ pub fn first_value( filter: Option, order_by: Option>, null_treatment: Option, -) -> PyResult { +) -> PyDataFusionResult { // If we initialize the UDAF with order_by directly, then it gets over-written by the builder let agg_fn = functions_aggregate::expr_fn::first_value(expr.expr, None); @@ -700,7 +707,7 @@ pub fn nth_value( filter: Option, order_by: Option>, null_treatment: Option, -) -> PyResult { +) -> PyDataFusionResult { let agg_fn = datafusion::functions_aggregate::nth_value::nth_value(expr.expr, n, vec![]); add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) } @@ -715,7 +722,7 @@ pub fn string_agg( filter: Option, order_by: Option>, null_treatment: Option, -) -> PyResult { +) -> PyDataFusionResult { let agg_fn = datafusion::functions_aggregate::string_agg::string_agg(expr.expr, lit(delimiter)); add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) } @@ -726,7 +733,7 @@ pub(crate) fn add_builder_fns_to_window( window_frame: Option, order_by: Option>, null_treatment: Option, -) -> PyResult { +) -> PyDataFusionResult { 
let null_treatment = null_treatment.map(|n| n.into()); let mut builder = window_fn.null_treatment(null_treatment); @@ -748,7 +755,7 @@ pub(crate) fn add_builder_fns_to_window( builder = builder.window_frame(window_frame.into()); } - builder.build().map(|e| e.into()).map_err(|err| err.into()) + Ok(builder.build().map(|e| e.into())?) } #[pyfunction] @@ -756,10 +763,11 @@ pub(crate) fn add_builder_fns_to_window( pub fn lead( arg: PyExpr, shift_offset: i64, - default_value: Option, + default_value: Option, partition_by: Option>, order_by: Option>, -) -> PyResult { +) -> PyDataFusionResult { + let default_value = default_value.map(|v| v.into()); let window_fn = functions_window::expr_fn::lead(arg.expr, Some(shift_offset), default_value); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) @@ -770,10 +778,11 @@ pub fn lead( pub fn lag( arg: PyExpr, shift_offset: i64, - default_value: Option, + default_value: Option, partition_by: Option>, order_by: Option>, -) -> PyResult { +) -> PyDataFusionResult { + let default_value = default_value.map(|v| v.into()); let window_fn = functions_window::expr_fn::lag(arg.expr, Some(shift_offset), default_value); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) @@ -784,7 +793,7 @@ pub fn lag( pub fn row_number( partition_by: Option>, order_by: Option>, -) -> PyResult { +) -> PyDataFusionResult { let window_fn = functions_window::expr_fn::row_number(); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) @@ -795,7 +804,7 @@ pub fn row_number( pub fn rank( partition_by: Option>, order_by: Option>, -) -> PyResult { +) -> PyDataFusionResult { let window_fn = functions_window::expr_fn::rank(); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) @@ -806,7 +815,7 @@ pub fn rank( pub fn dense_rank( partition_by: Option>, order_by: Option>, -) -> PyResult { +) -> PyDataFusionResult { let window_fn = functions_window::expr_fn::dense_rank(); 
add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) @@ -817,7 +826,7 @@ pub fn dense_rank( pub fn percent_rank( partition_by: Option>, order_by: Option>, -) -> PyResult { +) -> PyDataFusionResult { let window_fn = functions_window::expr_fn::percent_rank(); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) @@ -828,7 +837,7 @@ pub fn percent_rank( pub fn cume_dist( partition_by: Option>, order_by: Option>, -) -> PyResult { +) -> PyDataFusionResult { let window_fn = functions_window::expr_fn::cume_dist(); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) @@ -840,7 +849,7 @@ pub fn ntile( arg: PyExpr, partition_by: Option>, order_by: Option>, -) -> PyResult { +) -> PyDataFusionResult { let window_fn = functions_window::expr_fn::ntile(arg.into()); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) diff --git a/src/lib.rs b/src/lib.rs index 1111d5d06..317c3a49a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -48,6 +48,7 @@ pub mod expr; mod functions; pub mod physical_plan; mod pyarrow_filter_expression; +pub mod pyarrow_util; mod record_batch; pub mod sql; pub mod store; diff --git a/src/physical_plan.rs b/src/physical_plan.rs index 9ef2f0ebb..295908dc7 100644 --- a/src/physical_plan.rs +++ b/src/physical_plan.rs @@ -22,7 +22,7 @@ use std::sync::Arc; use pyo3::{exceptions::PyRuntimeError, prelude::*, types::PyBytes}; -use crate::{context::PySessionContext, errors::DataFusionError}; +use crate::{context::PySessionContext, errors::PyDataFusionResult}; #[pyclass(name = "ExecutionPlan", module = "datafusion", subclass)] #[derive(Debug, Clone)] @@ -58,7 +58,7 @@ impl PyExecutionPlan { format!("{}", d.indent(false)) } - pub fn to_proto<'py>(&'py self, py: Python<'py>) -> PyResult> { + pub fn to_proto<'py>(&'py self, py: Python<'py>) -> PyDataFusionResult> { let codec = DefaultPhysicalExtensionCodec {}; let proto = datafusion_proto::protobuf::PhysicalPlanNode::try_from_physical_plan( 
self.plan.clone(), @@ -70,7 +70,10 @@ impl PyExecutionPlan { } #[staticmethod] - pub fn from_proto(ctx: PySessionContext, proto_msg: Bound<'_, PyBytes>) -> PyResult { + pub fn from_proto( + ctx: PySessionContext, + proto_msg: Bound<'_, PyBytes>, + ) -> PyDataFusionResult { let bytes: &[u8] = proto_msg.extract()?; let proto_plan = datafusion_proto::protobuf::PhysicalPlanNode::decode(bytes).map_err(|e| { @@ -81,9 +84,7 @@ impl PyExecutionPlan { })?; let codec = DefaultPhysicalExtensionCodec {}; - let plan = proto_plan - .try_into_physical_plan(&ctx.ctx, &ctx.ctx.runtime_env(), &codec) - .map_err(DataFusionError::from)?; + let plan = proto_plan.try_into_physical_plan(&ctx.ctx, &ctx.ctx.runtime_env(), &codec)?; Ok(Self::new(plan)) } diff --git a/src/pyarrow_filter_expression.rs b/src/pyarrow_filter_expression.rs index 0f97ea442..314eebf4f 100644 --- a/src/pyarrow_filter_expression.rs +++ b/src/pyarrow_filter_expression.rs @@ -21,11 +21,11 @@ use pyo3::prelude::*; use std::convert::TryFrom; use std::result::Result; -use arrow::pyarrow::ToPyArrow; use datafusion::common::{Column, ScalarValue}; use datafusion::logical_expr::{expr::InList, Between, BinaryExpr, Expr, Operator}; -use crate::errors::DataFusionError; +use crate::errors::{PyDataFusionError, PyDataFusionResult}; +use crate::pyarrow_util::scalar_to_pyarrow; #[derive(Debug)] #[repr(transparent)] @@ -34,7 +34,7 @@ pub(crate) struct PyArrowFilterExpression(PyObject); fn operator_to_py<'py>( operator: &Operator, op: &Bound<'py, PyModule>, -) -> Result, DataFusionError> { +) -> PyDataFusionResult> { let py_op: Bound<'_, PyAny> = match operator { Operator::Eq => op.getattr("eq")?, Operator::NotEq => op.getattr("ne")?, @@ -45,7 +45,7 @@ fn operator_to_py<'py>( Operator::And => op.getattr("and_")?, Operator::Or => op.getattr("or_")?, _ => { - return Err(DataFusionError::Common(format!( + return Err(PyDataFusionError::Common(format!( "Unsupported operator {operator:?}" ))) } @@ -53,8 +53,8 @@ fn operator_to_py<'py>( 
Ok(py_op) } -fn extract_scalar_list(exprs: &[Expr], py: Python) -> Result, DataFusionError> { - let ret: Result, DataFusionError> = exprs +fn extract_scalar_list(exprs: &[Expr], py: Python) -> PyDataFusionResult> { + let ret = exprs .iter() .map(|expr| match expr { // TODO: should we also leverage `ScalarValue::to_pyarrow` here? @@ -71,11 +71,11 @@ fn extract_scalar_list(exprs: &[Expr], py: Python) -> Result, Data ScalarValue::Float32(Some(f)) => Ok(f.into_py(py)), ScalarValue::Float64(Some(f)) => Ok(f.into_py(py)), ScalarValue::Utf8(Some(s)) => Ok(s.into_py(py)), - _ => Err(DataFusionError::Common(format!( + _ => Err(PyDataFusionError::Common(format!( "PyArrow can't handle ScalarValue: {v:?}" ))), }, - _ => Err(DataFusionError::Common(format!( + _ => Err(PyDataFusionError::Common(format!( "Only a list of Literals are supported got {expr:?}" ))), }) @@ -90,7 +90,7 @@ impl PyArrowFilterExpression { } impl TryFrom<&Expr> for PyArrowFilterExpression { - type Error = DataFusionError; + type Error = PyDataFusionError; // Converts a Datafusion filter Expr into an expression string that can be evaluated by Python // Note that pyarrow.compute.{field,scalar} are put into Python globals() when evaluated @@ -100,9 +100,9 @@ impl TryFrom<&Expr> for PyArrowFilterExpression { Python::with_gil(|py| { let pc = Python::import_bound(py, "pyarrow.compute")?; let op_module = Python::import_bound(py, "operator")?; - let pc_expr: Result, DataFusionError> = match expr { + let pc_expr: PyDataFusionResult> = match expr { Expr::Column(Column { name, .. 
}) => Ok(pc.getattr("field")?.call1((name,))?), - Expr::Literal(scalar) => Ok(scalar.to_pyarrow(py)?.into_bound(py)), + Expr::Literal(scalar) => Ok(scalar_to_pyarrow(scalar, py)?.into_bound(py)), Expr::BinaryExpr(BinaryExpr { left, op, right }) => { let operator = operator_to_py(op, &op_module)?; let left = PyArrowFilterExpression::try_from(left.as_ref())?.0; @@ -167,7 +167,7 @@ impl TryFrom<&Expr> for PyArrowFilterExpression { Ok(if *negated { invert.call1((ret,))? } else { ret }) } - _ => Err(DataFusionError::Common(format!( + _ => Err(PyDataFusionError::Common(format!( "Unsupported Datafusion expression {expr:?}" ))), }; diff --git a/src/pyarrow_util.rs b/src/pyarrow_util.rs new file mode 100644 index 000000000..2b31467f8 --- /dev/null +++ b/src/pyarrow_util.rs @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Conversions between PyArrow and DataFusion types + +use arrow::array::{Array, ArrayData}; +use arrow::pyarrow::{FromPyArrow, ToPyArrow}; +use datafusion::scalar::ScalarValue; +use pyo3::types::{PyAnyMethods, PyList}; +use pyo3::{Bound, FromPyObject, PyAny, PyObject, PyResult, Python}; + +use crate::common::data_type::PyScalarValue; +use crate::errors::PyDataFusionError; + +impl FromPyArrow for PyScalarValue { + fn from_pyarrow_bound(value: &Bound<'_, PyAny>) -> PyResult { + let py = value.py(); + let typ = value.getattr("type")?; + let val = value.call_method0("as_py")?; + + // construct pyarrow array from the python value and pyarrow type + let factory = py.import_bound("pyarrow")?.getattr("array")?; + let args = PyList::new_bound(py, [val]); + let array = factory.call1((args, typ))?; + + // convert the pyarrow array to rust array using C data interface + let array = arrow::array::make_array(ArrayData::from_pyarrow_bound(&array)?); + let scalar = ScalarValue::try_from_array(&array, 0).map_err(PyDataFusionError::from)?; + + Ok(PyScalarValue(scalar)) + } +} + +impl<'source> FromPyObject<'source> for PyScalarValue { + fn extract_bound(value: &Bound<'source, PyAny>) -> PyResult { + Self::from_pyarrow_bound(value) + } +} + +pub fn scalar_to_pyarrow(scalar: &ScalarValue, py: Python) -> PyResult { + let array = scalar.to_array().map_err(PyDataFusionError::from)?; + // convert to pyarrow array using C data interface + let pyarray = array.to_data().to_pyarrow(py)?; + let pyscalar = pyarray.call_method1(py, "__getitem__", (0,))?; + + Ok(pyscalar) +} diff --git a/src/record_batch.rs b/src/record_batch.rs index eacdb5867..ec61c263f 100644 --- a/src/record_batch.rs +++ b/src/record_batch.rs @@ -17,6 +17,7 @@ use std::sync::Arc; +use crate::errors::PyDataFusionError; use crate::utils::wait_for_future; use datafusion::arrow::pyarrow::ToPyArrow; use datafusion::arrow::record_batch::RecordBatch; @@ -90,7 +91,7 @@ async fn next_stream( let mut stream = stream.lock().await; match 
stream.next().await { Some(Ok(batch)) => Ok(batch.into()), - Some(Err(e)) => Err(e.into()), + Some(Err(e)) => Err(PyDataFusionError::from(e))?, None => { // Depending on whether the iteration is sync or not, we raise either a // StopIteration or a StopAsyncIteration diff --git a/src/sql/exceptions.rs b/src/sql/exceptions.rs index c458402a0..cfb02274b 100644 --- a/src/sql/exceptions.rs +++ b/src/sql/exceptions.rs @@ -17,13 +17,7 @@ use std::fmt::{Debug, Display}; -use pyo3::{create_exception, PyErr}; - -// Identifies exceptions that occur while attempting to generate a `LogicalPlan` from a SQL string -create_exception!(rust, ParsingException, pyo3::exceptions::PyException); - -// Identifies exceptions that occur during attempts to optimization an existing `LogicalPlan` -create_exception!(rust, OptimizationException, pyo3::exceptions::PyException); +use pyo3::PyErr; pub fn py_type_err(e: impl Debug + Display) -> PyErr { PyErr::new::(format!("{e}")) @@ -33,10 +27,6 @@ pub fn py_runtime_err(e: impl Debug + Display) -> PyErr { PyErr::new::(format!("{e}")) } -pub fn py_parsing_exp(e: impl Debug + Display) -> PyErr { - PyErr::new::(format!("{e}")) -} - -pub fn py_optimization_exp(e: impl Debug + Display) -> PyErr { - PyErr::new::(format!("{e}")) +pub fn py_value_err(e: impl Debug + Display) -> PyErr { + PyErr::new::(format!("{e}")) } diff --git a/src/sql/logical.rs b/src/sql/logical.rs index a541889c7..1be33b75f 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -17,6 +17,7 @@ use std::sync::Arc; +use crate::errors::PyDataFusionResult; use crate::expr::aggregate::PyAggregate; use crate::expr::analyze::PyAnalyze; use crate::expr::distinct::PyDistinct; @@ -34,7 +35,7 @@ use crate::expr::table_scan::PyTableScan; use crate::expr::unnest::PyUnnest; use crate::expr::window::PyWindowExpr; use crate::{context::PySessionContext, errors::py_unsupported_variant_err}; -use datafusion::{error::DataFusionError, logical_expr::LogicalPlan}; +use 
datafusion::logical_expr::LogicalPlan; use datafusion_proto::logical_plan::{AsLogicalPlan, DefaultLogicalExtensionCodec}; use prost::Message; use pyo3::{exceptions::PyRuntimeError, prelude::*, types::PyBytes}; @@ -125,7 +126,7 @@ impl PyLogicalPlan { format!("{}", self.plan.display_graphviz()) } - pub fn to_proto<'py>(&'py self, py: Python<'py>) -> PyResult> { + pub fn to_proto<'py>(&'py self, py: Python<'py>) -> PyDataFusionResult> { let codec = DefaultLogicalExtensionCodec {}; let proto = datafusion_proto::protobuf::LogicalPlanNode::try_from_logical_plan(&self.plan, &codec)?; @@ -135,7 +136,10 @@ impl PyLogicalPlan { } #[staticmethod] - pub fn from_proto(ctx: PySessionContext, proto_msg: Bound<'_, PyBytes>) -> PyResult { + pub fn from_proto( + ctx: PySessionContext, + proto_msg: Bound<'_, PyBytes>, + ) -> PyDataFusionResult { let bytes: &[u8] = proto_msg.extract()?; let proto_plan = datafusion_proto::protobuf::LogicalPlanNode::decode(bytes).map_err(|e| { @@ -146,9 +150,7 @@ impl PyLogicalPlan { })?; let codec = DefaultLogicalExtensionCodec {}; - let plan = proto_plan - .try_into_logical_plan(&ctx.ctx, &codec) - .map_err(DataFusionError::from)?; + let plan = proto_plan.try_into_logical_plan(&ctx.ctx, &codec)?; Ok(Self::new(plan)) } } diff --git a/src/substrait.rs b/src/substrait.rs index 16e8c9507..8dcf3e8a7 100644 --- a/src/substrait.rs +++ b/src/substrait.rs @@ -18,7 +18,7 @@ use pyo3::{prelude::*, types::PyBytes}; use crate::context::PySessionContext; -use crate::errors::{py_datafusion_err, DataFusionError}; +use crate::errors::{py_datafusion_err, PyDataFusionError, PyDataFusionResult}; use crate::sql::logical::PyLogicalPlan; use crate::utils::wait_for_future; @@ -39,7 +39,7 @@ impl PyPlan { let mut proto_bytes = Vec::::new(); self.plan .encode(&mut proto_bytes) - .map_err(DataFusionError::EncodeError)?; + .map_err(PyDataFusionError::EncodeError)?; Ok(PyBytes::new_bound(py, &proto_bytes).unbind().into()) } } @@ -66,41 +66,47 @@ pub struct PySubstraitSerializer; 
#[pymethods] impl PySubstraitSerializer { #[staticmethod] - pub fn serialize(sql: &str, ctx: PySessionContext, path: &str, py: Python) -> PyResult<()> { - wait_for_future(py, serializer::serialize(sql, &ctx.ctx, path)) - .map_err(DataFusionError::from)?; + pub fn serialize( + sql: &str, + ctx: PySessionContext, + path: &str, + py: Python, + ) -> PyDataFusionResult<()> { + wait_for_future(py, serializer::serialize(sql, &ctx.ctx, path))?; Ok(()) } #[staticmethod] - pub fn serialize_to_plan(sql: &str, ctx: PySessionContext, py: Python) -> PyResult { - match PySubstraitSerializer::serialize_bytes(sql, ctx, py) { - Ok(proto_bytes) => { - let proto_bytes = proto_bytes.bind(py).downcast::().unwrap(); - PySubstraitSerializer::deserialize_bytes(proto_bytes.as_bytes().to_vec(), py) - } - Err(e) => Err(py_datafusion_err(e)), - } + pub fn serialize_to_plan( + sql: &str, + ctx: PySessionContext, + py: Python, + ) -> PyDataFusionResult { + PySubstraitSerializer::serialize_bytes(sql, ctx, py).and_then(|proto_bytes| { + let proto_bytes = proto_bytes.bind(py).downcast::().unwrap(); + PySubstraitSerializer::deserialize_bytes(proto_bytes.as_bytes().to_vec(), py) + }) } #[staticmethod] - pub fn serialize_bytes(sql: &str, ctx: PySessionContext, py: Python) -> PyResult { - let proto_bytes: Vec = wait_for_future(py, serializer::serialize_bytes(sql, &ctx.ctx)) - .map_err(DataFusionError::from)?; + pub fn serialize_bytes( + sql: &str, + ctx: PySessionContext, + py: Python, + ) -> PyDataFusionResult { + let proto_bytes: Vec = wait_for_future(py, serializer::serialize_bytes(sql, &ctx.ctx))?; Ok(PyBytes::new_bound(py, &proto_bytes).unbind().into()) } #[staticmethod] - pub fn deserialize(path: &str, py: Python) -> PyResult { - let plan = - wait_for_future(py, serializer::deserialize(path)).map_err(DataFusionError::from)?; + pub fn deserialize(path: &str, py: Python) -> PyDataFusionResult { + let plan = wait_for_future(py, serializer::deserialize(path))?; Ok(PyPlan { plan: *plan }) } 
#[staticmethod] - pub fn deserialize_bytes(proto_bytes: Vec, py: Python) -> PyResult { - let plan = wait_for_future(py, serializer::deserialize_bytes(proto_bytes)) - .map_err(DataFusionError::from)?; + pub fn deserialize_bytes(proto_bytes: Vec, py: Python) -> PyDataFusionResult { + let plan = wait_for_future(py, serializer::deserialize_bytes(proto_bytes))?; Ok(PyPlan { plan: *plan }) } } @@ -134,10 +140,10 @@ impl PySubstraitConsumer { ctx: &mut PySessionContext, plan: PyPlan, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let session_state = ctx.ctx.state(); let result = consumer::from_substrait_plan(&session_state, &plan.plan); - let logical_plan = wait_for_future(py, result).map_err(DataFusionError::from)?; + let logical_plan = wait_for_future(py, result)?; Ok(PyLogicalPlan::new(logical_plan)) } } diff --git a/src/udaf.rs b/src/udaf.rs index a6aa59ac3..5f21533e0 100644 --- a/src/udaf.rs +++ b/src/udaf.rs @@ -28,6 +28,7 @@ use datafusion::logical_expr::{ create_udaf, Accumulator, AccumulatorFactoryFunction, AggregateUDF, }; +use crate::common::data_type::PyScalarValue; use crate::expr::PyExpr; use crate::utils::parse_volatility; @@ -44,13 +45,25 @@ impl RustAccumulator { impl Accumulator for RustAccumulator { fn state(&mut self) -> Result> { - Python::with_gil(|py| self.accum.bind(py).call_method0("state")?.extract()) - .map_err(|e| DataFusionError::Execution(format!("{e}"))) + Python::with_gil(|py| { + self.accum + .bind(py) + .call_method0("state")? + .extract::>() + }) + .map(|v| v.into_iter().map(|x| x.0).collect()) + .map_err(|e| DataFusionError::Execution(format!("{e}"))) } fn evaluate(&mut self) -> Result { - Python::with_gil(|py| self.accum.bind(py).call_method0("evaluate")?.extract()) - .map_err(|e| DataFusionError::Execution(format!("{e}"))) + Python::with_gil(|py| { + self.accum + .bind(py) + .call_method0("evaluate")? 
+ .extract::() + }) + .map(|v| v.0) + .map_err(|e| DataFusionError::Execution(format!("{e}"))) } fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { diff --git a/src/udwf.rs b/src/udwf.rs index 689eb79e3..04a4a1640 100644 --- a/src/udwf.rs +++ b/src/udwf.rs @@ -26,6 +26,7 @@ use datafusion::scalar::ScalarValue; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; +use crate::common::data_type::PyScalarValue; use crate::expr::PyExpr; use crate::utils::parse_volatility; use datafusion::arrow::datatypes::DataType; @@ -133,7 +134,8 @@ impl PartitionEvaluator for RustPartitionEvaluator { self.evaluator .bind(py) .call_method1("evaluate", py_args) - .and_then(|v| v.extract()) + .and_then(|v| v.extract::()) + .map(|v| v.0) .map_err(|e| DataFusionError::Execution(format!("{e}"))) }) } diff --git a/src/utils.rs b/src/utils.rs index 795589752..ed224b364 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::errors::DataFusionError; +use crate::errors::{PyDataFusionError, PyDataFusionResult}; use crate::TokioRuntime; use datafusion::logical_expr::Volatility; use pyo3::exceptions::PyValueError; @@ -47,13 +47,13 @@ where py.allow_threads(|| runtime.block_on(f)) } -pub(crate) fn parse_volatility(value: &str) -> Result { +pub(crate) fn parse_volatility(value: &str) -> PyDataFusionResult { Ok(match value { "immutable" => Volatility::Immutable, "stable" => Volatility::Stable, "volatile" => Volatility::Volatile, value => { - return Err(DataFusionError::Common(format!( + return Err(PyDataFusionError::Common(format!( "Unsupportad volatility type: `{value}`, supported \ values are: immutable, stable and volatile." 
))) From d3c4dabe3c24d419911106bdde3dfe1244e1224c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 5 Feb 2025 09:42:03 -0500 Subject: [PATCH 094/248] Fix verify-release-candidate script by removing reference to requirements-310.txt (#1012) * Fix verify-release-candidate script by removing requirements.txt * Update dev/release/verify-release-candidate.sh Co-authored-by: Kevin Liu --------- Co-authored-by: Kevin Liu --- dev/release/verify-release-candidate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 1a9104b55..2bfce0e2d 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -128,7 +128,7 @@ test_source_distribution() { python3 -m venv .venv source .venv/bin/activate python3 -m pip install -U pip - python3 -m pip install -r requirements-310.txt + python3 -m pip install -U maturin maturin develop #TODO: we should really run tests here as well From 93ac6a820353b3ddea014be1eddad8bd004b0fce Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 7 Feb 2025 10:39:51 -0500 Subject: [PATCH 095/248] Prepare release 44.0.0 (#1009) --- Cargo.lock | 2 +- Cargo.toml | 2 +- dev/changelog/44.0.0.md | 58 ++++++++ pyproject.toml | 1 + uv.lock | 301 +++++++++++++++++++++++++++++++++++++++- 5 files changed, 361 insertions(+), 3 deletions(-) create mode 100644 dev/changelog/44.0.0.md diff --git a/Cargo.lock b/Cargo.lock index c6590fd21..50809696b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1327,7 +1327,7 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "43.0.0" +version = "44.0.0" dependencies = [ "arrow", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index 003ba36e5..44e6e2244 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "43.0.0" +version = "44.0.0" homepage = "https://datafusion.apache.org/python" repository = 
"https://github.com/apache/datafusion-python" authors = ["Apache DataFusion "] diff --git a/dev/changelog/44.0.0.md b/dev/changelog/44.0.0.md new file mode 100644 index 000000000..c5ed4bdb0 --- /dev/null +++ b/dev/changelog/44.0.0.md @@ -0,0 +1,58 @@ + + +# Apache DataFusion Python 44.0.0 Changelog + +This release consists of 12 commits from 5 contributors. See credits at the end of this changelog for more information. + +**Implemented enhancements:** + +- feat: support enable_url_table config [#980](https://github.com/apache/datafusion-python/pull/980) (chenkovsky) +- feat: remove DataFusion pyarrow feat [#1000](https://github.com/apache/datafusion-python/pull/1000) (timsaucer) + +**Fixed bugs:** + +- fix: correct LZ0 to LZO in compression options [#995](https://github.com/apache/datafusion-python/pull/995) (kosiew) + +**Other:** + +- Add arrow cast [#962](https://github.com/apache/datafusion-python/pull/962) (kosiew) +- Fix small issues in pyproject.toml [#976](https://github.com/apache/datafusion-python/pull/976) (kylebarron) +- chore: set validation and type hint for ffi tableprovider [#983](https://github.com/apache/datafusion-python/pull/983) (ion-elgreco) +- Support async iteration of RecordBatchStream [#975](https://github.com/apache/datafusion-python/pull/975) (kylebarron) +- Chore/upgrade datafusion 44 [#973](https://github.com/apache/datafusion-python/pull/973) (timsaucer) +- Default to ZSTD compression when writing Parquet [#981](https://github.com/apache/datafusion-python/pull/981) (kosiew) +- Feat/use uv python management [#994](https://github.com/apache/datafusion-python/pull/994) (timsaucer) +- minor: Update dependencies prior to release [#999](https://github.com/apache/datafusion-python/pull/999) (timsaucer) +- Apply import ordering in ruff check [#1001](https://github.com/apache/datafusion-python/pull/1001) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. 
Here is a breakdown of commits (PRs merged) per contributor. + +``` + 5 Tim Saucer + 3 kosiew + 2 Kyle Barron + 1 Chongchen Chen + 1 Ion Koutsouris +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + diff --git a/pyproject.toml b/pyproject.toml index 32bb28d21..f416e02a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,6 +89,7 @@ dev = [ "pytest>=7.4.4", "ruff>=0.9.1", "toml>=0.10.2", + "pygithub==2.5.0", ] docs = [ "sphinx>=7.1.2", diff --git a/uv.lock b/uv.lock index 75d9ed018..587ddc8b7 100644 --- a/uv.lock +++ b/uv.lock @@ -139,6 +139,83 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a5/32/8f6669fc4798494966bf446c8c4a162e0b5d893dff088afddf76414f70e1/certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56", size = 164927 }, ] +[[package]] +name = "cffi" +version = "1.17.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pycparser" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fc/97/c783634659c2920c3fc70419e3af40972dbaf758daa229a7d6ea6135c90d/cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824", size = 516621 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/07/f44ca684db4e4f08a3fdc6eeb9a0d15dc6883efc7b8c90357fdbf74e186c/cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14", size = 182191 }, + { url = "https://files.pythonhosted.org/packages/08/fd/cc2fedbd887223f9f5d170c96e57cbf655df9831a6546c1727ae13fa977a/cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67", size = 178592 }, + { url = 
"https://files.pythonhosted.org/packages/de/cc/4635c320081c78d6ffc2cab0a76025b691a91204f4aa317d568ff9280a2d/cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382", size = 426024 }, + { url = "https://files.pythonhosted.org/packages/b6/7b/3b2b250f3aab91abe5f8a51ada1b717935fdaec53f790ad4100fe2ec64d1/cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702", size = 448188 }, + { url = "https://files.pythonhosted.org/packages/d3/48/1b9283ebbf0ec065148d8de05d647a986c5f22586b18120020452fff8f5d/cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3", size = 455571 }, + { url = "https://files.pythonhosted.org/packages/40/87/3b8452525437b40f39ca7ff70276679772ee7e8b394934ff60e63b7b090c/cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6", size = 436687 }, + { url = "https://files.pythonhosted.org/packages/8d/fb/4da72871d177d63649ac449aec2e8a29efe0274035880c7af59101ca2232/cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17", size = 446211 }, + { url = "https://files.pythonhosted.org/packages/ab/a0/62f00bcb411332106c02b663b26f3545a9ef136f80d5df746c05878f8c4b/cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8", size = 461325 }, + { url = "https://files.pythonhosted.org/packages/36/83/76127035ed2e7e27b0787604d99da630ac3123bfb02d8e80c633f218a11d/cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e", size 
= 438784 }, + { url = "https://files.pythonhosted.org/packages/21/81/a6cd025db2f08ac88b901b745c163d884641909641f9b826e8cb87645942/cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be", size = 461564 }, + { url = "https://files.pythonhosted.org/packages/f8/fe/4d41c2f200c4a457933dbd98d3cf4e911870877bd94d9656cc0fcb390681/cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c", size = 171804 }, + { url = "https://files.pythonhosted.org/packages/d1/b6/0b0f5ab93b0df4acc49cae758c81fe4e5ef26c3ae2e10cc69249dfd8b3ab/cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15", size = 181299 }, + { url = "https://files.pythonhosted.org/packages/6b/f4/927e3a8899e52a27fa57a48607ff7dc91a9ebe97399b357b85a0c7892e00/cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401", size = 182264 }, + { url = "https://files.pythonhosted.org/packages/6c/f5/6c3a8efe5f503175aaddcbea6ad0d2c96dad6f5abb205750d1b3df44ef29/cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf", size = 178651 }, + { url = "https://files.pythonhosted.org/packages/94/dd/a3f0118e688d1b1a57553da23b16bdade96d2f9bcda4d32e7d2838047ff7/cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4", size = 445259 }, + { url = "https://files.pythonhosted.org/packages/2e/ea/70ce63780f096e16ce8588efe039d3c4f91deb1dc01e9c73a287939c79a6/cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41", size = 469200 }, + { url = 
"https://files.pythonhosted.org/packages/1c/a0/a4fa9f4f781bda074c3ddd57a572b060fa0df7655d2a4247bbe277200146/cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1", size = 477235 }, + { url = "https://files.pythonhosted.org/packages/62/12/ce8710b5b8affbcdd5c6e367217c242524ad17a02fe5beec3ee339f69f85/cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6", size = 459721 }, + { url = "https://files.pythonhosted.org/packages/ff/6b/d45873c5e0242196f042d555526f92aa9e0c32355a1be1ff8c27f077fd37/cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d", size = 467242 }, + { url = "https://files.pythonhosted.org/packages/1a/52/d9a0e523a572fbccf2955f5abe883cfa8bcc570d7faeee06336fbd50c9fc/cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6", size = 477999 }, + { url = "https://files.pythonhosted.org/packages/44/74/f2a2460684a1a2d00ca799ad880d54652841a780c4c97b87754f660c7603/cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f", size = 454242 }, + { url = "https://files.pythonhosted.org/packages/f8/4a/34599cac7dfcd888ff54e801afe06a19c17787dfd94495ab0c8d35fe99fb/cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b", size = 478604 }, + { url = "https://files.pythonhosted.org/packages/34/33/e1b8a1ba29025adbdcda5fb3a36f94c03d771c1b7b12f726ff7fef2ebe36/cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655", size = 171727 }, + { url = 
"https://files.pythonhosted.org/packages/3d/97/50228be003bb2802627d28ec0627837ac0bf35c90cf769812056f235b2d1/cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0", size = 181400 }, + { url = "https://files.pythonhosted.org/packages/5a/84/e94227139ee5fb4d600a7a4927f322e1d4aea6fdc50bd3fca8493caba23f/cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4", size = 183178 }, + { url = "https://files.pythonhosted.org/packages/da/ee/fb72c2b48656111c4ef27f0f91da355e130a923473bf5ee75c5643d00cca/cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c", size = 178840 }, + { url = "https://files.pythonhosted.org/packages/cc/b6/db007700f67d151abadf508cbfd6a1884f57eab90b1bb985c4c8c02b0f28/cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36", size = 454803 }, + { url = "https://files.pythonhosted.org/packages/1a/df/f8d151540d8c200eb1c6fba8cd0dfd40904f1b0682ea705c36e6c2e97ab3/cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5", size = 478850 }, + { url = "https://files.pythonhosted.org/packages/28/c0/b31116332a547fd2677ae5b78a2ef662dfc8023d67f41b2a83f7c2aa78b1/cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff", size = 485729 }, + { url = "https://files.pythonhosted.org/packages/91/2b/9a1ddfa5c7f13cab007a2c9cc295b70fbbda7cb10a286aa6810338e60ea1/cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99", size = 471256 }, + { url = 
"https://files.pythonhosted.org/packages/b2/d5/da47df7004cb17e4955df6a43d14b3b4ae77737dff8bf7f8f333196717bf/cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93", size = 479424 }, + { url = "https://files.pythonhosted.org/packages/0b/ac/2a28bcf513e93a219c8a4e8e125534f4f6db03e3179ba1c45e949b76212c/cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3", size = 484568 }, + { url = "https://files.pythonhosted.org/packages/d4/38/ca8a4f639065f14ae0f1d9751e70447a261f1a30fa7547a828ae08142465/cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8", size = 488736 }, + { url = "https://files.pythonhosted.org/packages/86/c5/28b2d6f799ec0bdecf44dced2ec5ed43e0eb63097b0f58c293583b406582/cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65", size = 172448 }, + { url = "https://files.pythonhosted.org/packages/50/b9/db34c4755a7bd1cb2d1603ac3863f22bcecbd1ba29e5ee841a4bc510b294/cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903", size = 181976 }, + { url = "https://files.pythonhosted.org/packages/8d/f8/dd6c246b148639254dad4d6803eb6a54e8c85c6e11ec9df2cffa87571dbe/cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e", size = 182989 }, + { url = "https://files.pythonhosted.org/packages/8b/f1/672d303ddf17c24fc83afd712316fda78dc6fce1cd53011b839483e1ecc8/cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2", size = 178802 }, + { url = 
"https://files.pythonhosted.org/packages/0e/2d/eab2e858a91fdff70533cab61dcff4a1f55ec60425832ddfdc9cd36bc8af/cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3", size = 454792 }, + { url = "https://files.pythonhosted.org/packages/75/b2/fbaec7c4455c604e29388d55599b99ebcc250a60050610fadde58932b7ee/cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683", size = 478893 }, + { url = "https://files.pythonhosted.org/packages/4f/b7/6e4a2162178bf1935c336d4da8a9352cccab4d3a5d7914065490f08c0690/cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5", size = 485810 }, + { url = "https://files.pythonhosted.org/packages/c7/8a/1d0e4a9c26e54746dc08c2c6c037889124d4f59dffd853a659fa545f1b40/cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4", size = 471200 }, + { url = "https://files.pythonhosted.org/packages/26/9f/1aab65a6c0db35f43c4d1b4f580e8df53914310afc10ae0397d29d697af4/cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd", size = 479447 }, + { url = "https://files.pythonhosted.org/packages/5f/e4/fb8b3dd8dc0e98edf1135ff067ae070bb32ef9d509d6cb0f538cd6f7483f/cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed", size = 484358 }, + { url = "https://files.pythonhosted.org/packages/f1/47/d7145bf2dc04684935d57d67dff9d6d795b2ba2796806bb109864be3a151/cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9", 
size = 488469 }, + { url = "https://files.pythonhosted.org/packages/bf/ee/f94057fa6426481d663b88637a9a10e859e492c73d0384514a17d78ee205/cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d", size = 172475 }, + { url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009 }, + { url = "https://files.pythonhosted.org/packages/48/08/15bf6b43ae9bd06f6b00ad8a91f5a8fe1069d4c9fab550a866755402724e/cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b", size = 182457 }, + { url = "https://files.pythonhosted.org/packages/c2/5b/f1523dd545f92f7df468e5f653ffa4df30ac222f3c884e51e139878f1cb5/cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964", size = 425932 }, + { url = "https://files.pythonhosted.org/packages/53/93/7e547ab4105969cc8c93b38a667b82a835dd2cc78f3a7dad6130cfd41e1d/cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9", size = 448585 }, + { url = "https://files.pythonhosted.org/packages/56/c4/a308f2c332006206bb511de219efeff090e9d63529ba0a77aae72e82248b/cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc", size = 456268 }, + { url = "https://files.pythonhosted.org/packages/ca/5b/b63681518265f2f4060d2b60755c1c77ec89e5e045fc3773b72735ddaad5/cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c", size = 436592 }, + { url = 
"https://files.pythonhosted.org/packages/bb/19/b51af9f4a4faa4a8ac5a0e5d5c2522dcd9703d07fac69da34a36c4d960d3/cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1", size = 446512 }, + { url = "https://files.pythonhosted.org/packages/e2/63/2bed8323890cb613bbecda807688a31ed11a7fe7afe31f8faaae0206a9a3/cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8", size = 171576 }, + { url = "https://files.pythonhosted.org/packages/2f/70/80c33b044ebc79527447fd4fbc5455d514c3bb840dede4455de97da39b4d/cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1", size = 181229 }, + { url = "https://files.pythonhosted.org/packages/b9/ea/8bb50596b8ffbc49ddd7a1ad305035daa770202a6b782fc164647c2673ad/cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16", size = 182220 }, + { url = "https://files.pythonhosted.org/packages/ae/11/e77c8cd24f58285a82c23af484cf5b124a376b32644e445960d1a4654c3a/cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36", size = 178605 }, + { url = "https://files.pythonhosted.org/packages/ed/65/25a8dc32c53bf5b7b6c2686b42ae2ad58743f7ff644844af7cdb29b49361/cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8", size = 424910 }, + { url = "https://files.pythonhosted.org/packages/42/7a/9d086fab7c66bd7c4d0f27c57a1b6b068ced810afc498cc8c49e0088661c/cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576", size = 447200 }, + { url = 
"https://files.pythonhosted.org/packages/da/63/1785ced118ce92a993b0ec9e0d0ac8dc3e5dbfbcaa81135be56c69cabbb6/cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87", size = 454565 }, + { url = "https://files.pythonhosted.org/packages/74/06/90b8a44abf3556599cdec107f7290277ae8901a58f75e6fe8f970cd72418/cffi-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0", size = 435635 }, + { url = "https://files.pythonhosted.org/packages/bd/62/a1f468e5708a70b1d86ead5bab5520861d9c7eacce4a885ded9faa7729c3/cffi-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3", size = 445218 }, + { url = "https://files.pythonhosted.org/packages/5b/95/b34462f3ccb09c2594aa782d90a90b045de4ff1f70148ee79c69d37a0a5a/cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595", size = 460486 }, + { url = "https://files.pythonhosted.org/packages/fc/fc/a1e4bebd8d680febd29cf6c8a40067182b64f00c7d105f8f26b5bc54317b/cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a", size = 437911 }, + { url = "https://files.pythonhosted.org/packages/e6/c3/21cab7a6154b6a5ea330ae80de386e7665254835b9e98ecc1340b3a7de9a/cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e", size = 460632 }, + { url = "https://files.pythonhosted.org/packages/cb/b5/fd9f8b5a84010ca169ee49f4e4ad6f8c05f4e3545b72ee041dbbcb159882/cffi-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7", size = 171820 }, + { url = 
"https://files.pythonhosted.org/packages/8c/52/b08750ce0bce45c143e1b5d7357ee8c55341b52bdef4b0f081af1eb248c2/cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662", size = 181290 }, +] + [[package]] name = "charset-normalizer" version = "3.4.1" @@ -235,9 +312,46 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, ] +[[package]] +name = "cryptography" +version = "44.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/91/4c/45dfa6829acffa344e3967d6006ee4ae8be57af746ae2eba1c431949b32c/cryptography-44.0.0.tar.gz", hash = "sha256:cd4e834f340b4293430701e772ec543b0fbe6c2dea510a5286fe0acabe153a02", size = 710657 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/09/8cc67f9b84730ad330b3b72cf867150744bf07ff113cda21a15a1c6d2c7c/cryptography-44.0.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:84111ad4ff3f6253820e6d3e58be2cc2a00adb29335d4cacb5ab4d4d34f2a123", size = 6541833 }, + { url = "https://files.pythonhosted.org/packages/7e/5b/3759e30a103144e29632e7cb72aec28cedc79e514b2ea8896bb17163c19b/cryptography-44.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15492a11f9e1b62ba9d73c210e2416724633167de94607ec6069ef724fad092", size = 3922710 }, + { url = "https://files.pythonhosted.org/packages/5f/58/3b14bf39f1a0cfd679e753e8647ada56cddbf5acebffe7db90e184c76168/cryptography-44.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:831c3c4d0774e488fdc83a1923b49b9957d33287de923d58ebd3cec47a0ae43f", size = 4137546 }, + { url = 
"https://files.pythonhosted.org/packages/98/65/13d9e76ca19b0ba5603d71ac8424b5694415b348e719db277b5edc985ff5/cryptography-44.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:761817a3377ef15ac23cd7834715081791d4ec77f9297ee694ca1ee9c2c7e5eb", size = 3915420 }, + { url = "https://files.pythonhosted.org/packages/b1/07/40fe09ce96b91fc9276a9ad272832ead0fddedcba87f1190372af8e3039c/cryptography-44.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3c672a53c0fb4725a29c303be906d3c1fa99c32f58abe008a82705f9ee96f40b", size = 4154498 }, + { url = "https://files.pythonhosted.org/packages/75/ea/af65619c800ec0a7e4034207aec543acdf248d9bffba0533342d1bd435e1/cryptography-44.0.0-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:4ac4c9f37eba52cb6fbeaf5b59c152ea976726b865bd4cf87883a7e7006cc543", size = 3932569 }, + { url = "https://files.pythonhosted.org/packages/c7/af/d1deb0c04d59612e3d5e54203159e284d3e7a6921e565bb0eeb6269bdd8a/cryptography-44.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ed3534eb1090483c96178fcb0f8893719d96d5274dfde98aa6add34614e97c8e", size = 4016721 }, + { url = "https://files.pythonhosted.org/packages/bd/69/7ca326c55698d0688db867795134bdfac87136b80ef373aaa42b225d6dd5/cryptography-44.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f3f6fdfa89ee2d9d496e2c087cebef9d4fcbb0ad63c40e821b39f74bf48d9c5e", size = 4240915 }, + { url = "https://files.pythonhosted.org/packages/ef/d4/cae11bf68c0f981e0413906c6dd03ae7fa864347ed5fac40021df1ef467c/cryptography-44.0.0-cp37-abi3-win32.whl", hash = "sha256:eb33480f1bad5b78233b0ad3e1b0be21e8ef1da745d8d2aecbb20671658b9053", size = 2757925 }, + { url = "https://files.pythonhosted.org/packages/64/b1/50d7739254d2002acae64eed4fc43b24ac0cc44bf0a0d388d1ca06ec5bb1/cryptography-44.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:abc998e0c0eee3c8a1904221d3f67dcfa76422b23620173e28c11d3e626c21bd", size = 3202055 }, + { url = 
"https://files.pythonhosted.org/packages/11/18/61e52a3d28fc1514a43b0ac291177acd1b4de00e9301aaf7ef867076ff8a/cryptography-44.0.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:660cb7312a08bc38be15b696462fa7cc7cd85c3ed9c576e81f4dc4d8b2b31591", size = 6542801 }, + { url = "https://files.pythonhosted.org/packages/1a/07/5f165b6c65696ef75601b781a280fc3b33f1e0cd6aa5a92d9fb96c410e97/cryptography-44.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1923cb251c04be85eec9fda837661c67c1049063305d6be5721643c22dd4e2b7", size = 3922613 }, + { url = "https://files.pythonhosted.org/packages/28/34/6b3ac1d80fc174812486561cf25194338151780f27e438526f9c64e16869/cryptography-44.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:404fdc66ee5f83a1388be54300ae978b2efd538018de18556dde92575e05defc", size = 4137925 }, + { url = "https://files.pythonhosted.org/packages/d0/c7/c656eb08fd22255d21bc3129625ed9cd5ee305f33752ef2278711b3fa98b/cryptography-44.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c5eb858beed7835e5ad1faba59e865109f3e52b3783b9ac21e7e47dc5554e289", size = 3915417 }, + { url = "https://files.pythonhosted.org/packages/ef/82/72403624f197af0db6bac4e58153bc9ac0e6020e57234115db9596eee85d/cryptography-44.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f53c2c87e0fb4b0c00fa9571082a057e37690a8f12233306161c8f4b819960b7", size = 4155160 }, + { url = "https://files.pythonhosted.org/packages/a2/cd/2f3c440913d4329ade49b146d74f2e9766422e1732613f57097fea61f344/cryptography-44.0.0-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:9e6fc8a08e116fb7c7dd1f040074c9d7b51d74a8ea40d4df2fc7aa08b76b9e6c", size = 3932331 }, + { url = "https://files.pythonhosted.org/packages/7f/df/8be88797f0a1cca6e255189a57bb49237402b1880d6e8721690c5603ac23/cryptography-44.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:d2436114e46b36d00f8b72ff57e598978b37399d2786fd39793c36c6d5cb1c64", size = 4017372 }, + { url = 
"https://files.pythonhosted.org/packages/af/36/5ccc376f025a834e72b8e52e18746b927f34e4520487098e283a719c205e/cryptography-44.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a01956ddfa0a6790d594f5b34fc1bfa6098aca434696a03cfdbe469b8ed79285", size = 4239657 }, + { url = "https://files.pythonhosted.org/packages/46/b0/f4f7d0d0bcfbc8dd6296c1449be326d04217c57afb8b2594f017eed95533/cryptography-44.0.0-cp39-abi3-win32.whl", hash = "sha256:eca27345e1214d1b9f9490d200f9db5a874479be914199194e746c893788d417", size = 2758672 }, + { url = "https://files.pythonhosted.org/packages/97/9b/443270b9210f13f6ef240eff73fd32e02d381e7103969dc66ce8e89ee901/cryptography-44.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:708ee5f1bafe76d041b53a4f95eb28cdeb8d18da17e597d46d7833ee59b97ede", size = 3202071 }, + { url = "https://files.pythonhosted.org/packages/77/d4/fea74422326388bbac0c37b7489a0fcb1681a698c3b875959430ba550daa/cryptography-44.0.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:37d76e6863da3774cd9db5b409a9ecfd2c71c981c38788d3fcfaf177f447b731", size = 3338857 }, + { url = "https://files.pythonhosted.org/packages/1a/aa/ba8a7467c206cb7b62f09b4168da541b5109838627f582843bbbe0235e8e/cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:f677e1268c4e23420c3acade68fac427fffcb8d19d7df95ed7ad17cdef8404f4", size = 3850615 }, + { url = "https://files.pythonhosted.org/packages/89/fa/b160e10a64cc395d090105be14f399b94e617c879efd401188ce0fea39ee/cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f5e7cb1e5e56ca0933b4873c0220a78b773b24d40d186b6738080b73d3d0a756", size = 4081622 }, + { url = "https://files.pythonhosted.org/packages/47/8f/20ff0656bb0cf7af26ec1d01f780c5cfbaa7666736063378c5f48558b515/cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:8b3e6eae66cf54701ee7d9c83c30ac0a1e3fa17be486033000f2a73a12ab507c", size = 3867546 }, + { url = 
"https://files.pythonhosted.org/packages/38/d9/28edf32ee2fcdca587146bcde90102a7319b2f2c690edfa627e46d586050/cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:be4ce505894d15d5c5037167ffb7f0ae90b7be6f2a98f9a5c3442395501c32fa", size = 4090937 }, + { url = "https://files.pythonhosted.org/packages/cc/9d/37e5da7519de7b0b070a3fedd4230fe76d50d2a21403e0f2153d70ac4163/cryptography-44.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:62901fb618f74d7d81bf408c8719e9ec14d863086efe4185afd07c352aee1d2c", size = 3128774 }, +] + [[package]] name = "datafusion" -version = "43.0.0" +version = "44.0.0" source = { editable = "." } dependencies = [ { name = "pyarrow", version = "17.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, @@ -249,6 +363,7 @@ dependencies = [ dev = [ { name = "maturin" }, { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "pygithub" }, { name = "pytest" }, { name = "ruff" }, { name = "toml" }, @@ -282,6 +397,7 @@ requires-dist = [ dev = [ { name = "maturin", specifier = ">=1.8.1" }, { name = "numpy", marker = "python_full_version >= '3.10'", specifier = ">1.24.4" }, + { name = "pygithub", specifier = "==2.5.0" }, { name = "pytest", specifier = ">=7.4.4" }, { name = "ruff", specifier = ">=0.9.1" }, { name = "toml", specifier = ">=0.10.2" }, @@ -307,6 +423,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186", size = 9073 }, ] +[[package]] +name = "deprecated" +version = "1.2.18" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wrapt" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/98/97/06afe62762c9a8a86af0cfb7bfdab22a43ad17138b07af5b1a58442690a2/deprecated-1.2.18.tar.gz", hash = "sha256:422b6f6d859da6f2ef57857761bfb392480502a64c3028ca9bbe86085d72115d", size = 2928744 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec", size = 9998 }, +] + [[package]] name = "docutils" version = "0.20.1" @@ -1189,6 +1317,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/87/d8/94161a7ca5c55199484e926165e9e33f318ea1d1b0d7cdbcbc3652b933ec/pyarrow-18.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:a1880dd6772b685e803011a6b43a230c23b566859a6e0c9a276c1e0faf4f4052", size = 25301373 }, ] +[[package]] +name = "pycparser" +version = "2.22" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1d/b2/31537cf4b1ca988837256c910a668b553fceb8f069bedc4b1c826024b52c/pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6", size = 172736 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552 }, +] + [[package]] name = "pydata-sphinx-theme" version = "0.8.0" @@ -1206,6 +1343,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/91/26/0694318d46c7d90ab602ae27b24431e939f1600f9a4c69d1e727ec57289f/pydata_sphinx_theme-0.8.0-py3-none-any.whl", hash = "sha256:fbcbb833a07d3ad8dd997dd40dc94da18d98b41c68123ab0182b58fe92271204", size = 3284997 }, ] +[[package]] +name = "pygithub" +version = "2.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecated" }, + { name = "pyjwt", version = 
"2.9.0", source = { registry = "https://pypi.org/simple" }, extra = ["crypto"], marker = "python_full_version < '3.9'" }, + { name = "pyjwt", version = "2.10.1", source = { registry = "https://pypi.org/simple" }, extra = ["crypto"], marker = "python_full_version >= '3.9'" }, + { name = "pynacl" }, + { name = "requests" }, + { name = "typing-extensions" }, + { name = "urllib3", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "urllib3", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/16/ce/aa91d30040d9552c274e7ea8bd10a977600d508d579a4bb262b95eccf961/pygithub-2.5.0.tar.gz", hash = "sha256:e1613ac508a9be710920d26eb18b1905ebd9926aa49398e88151c1b526aad3cf", size = 3552804 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/37/05/bfbdbbc5d8aafd8dae9b3b6877edca561fccd8528ef5edc4e7b6d23721b5/PyGithub-2.5.0-py3-none-any.whl", hash = "sha256:b0b635999a658ab8e08720bdd3318893ff20e2275f6446fcf35bf3f44f2c0fd2", size = 375935 }, +] + [[package]] name = "pygments" version = "2.19.1" @@ -1215,6 +1371,63 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 }, ] +[[package]] +name = "pyjwt" +version = "2.9.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.9'", +] +sdist = { url = "https://files.pythonhosted.org/packages/fb/68/ce067f09fca4abeca8771fe667d89cc347d1e99da3e093112ac329c6020e/pyjwt-2.9.0.tar.gz", hash = "sha256:7e1e5b56cc735432a7369cbfa0efe50fa113ebecdc04ae6922deba8b84582d0c", size = 78825 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/79/84/0fdf9b18ba31d69877bd39c9cd6052b47f3761e9910c15de788e519f079f/PyJWT-2.9.0-py3-none-any.whl", hash = "sha256:3b02fb0f44517787776cf48f2ae25d8e14f300e6d7545a4315cee571a415e850", size = 22344 }, +] + +[package.optional-dependencies] +crypto = [ + { name = "cryptography", marker = "python_full_version < '3.9'" }, +] + +[[package]] +name = "pyjwt" +version = "2.10.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", + "python_full_version == '3.9.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/e7/46/bd74733ff231675599650d3e47f361794b22ef3e3770998dda30d3b63726/pyjwt-2.10.1.tar.gz", hash = "sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953", size = 87785 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/61/ad/689f02752eeec26aed679477e80e632ef1b682313be70793d798c1d5fc8f/PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb", size = 22997 }, +] + +[package.optional-dependencies] +crypto = [ + { name = "cryptography", marker = "python_full_version >= '3.9'" }, +] + +[[package]] +name = "pynacl" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a7/22/27582568be639dfe22ddb3902225f91f2f17ceff88ce80e4db396c8986da/PyNaCl-1.5.0.tar.gz", hash = "sha256:8ac7448f09ab85811607bdd21ec2464495ac8b7c66d146bf545b0f08fb9220ba", size = 3392854 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/75/0b8ede18506041c0bf23ac4d8e2971b4161cd6ce630b177d0a08eb0d8857/PyNaCl-1.5.0-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:401002a4aaa07c9414132aaed7f6836ff98f59277a234704ff66878c2ee4a0d1", size = 349920 }, + { url = 
"https://files.pythonhosted.org/packages/59/bb/fddf10acd09637327a97ef89d2a9d621328850a72f1fdc8c08bdf72e385f/PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:52cb72a79269189d4e0dc537556f4740f7f0a9ec41c1322598799b0bdad4ef92", size = 601722 }, + { url = "https://files.pythonhosted.org/packages/5d/70/87a065c37cca41a75f2ce113a5a2c2aa7533be648b184ade58971b5f7ccc/PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a36d4a9dda1f19ce6e03c9a784a2921a4b726b02e1c736600ca9c22029474394", size = 680087 }, + { url = "https://files.pythonhosted.org/packages/ee/87/f1bb6a595f14a327e8285b9eb54d41fef76c585a0edef0a45f6fc95de125/PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:0c84947a22519e013607c9be43706dd42513f9e6ae5d39d3613ca1e142fba44d", size = 856678 }, + { url = "https://files.pythonhosted.org/packages/66/28/ca86676b69bf9f90e710571b67450508484388bfce09acf8a46f0b8c785f/PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06b8f6fa7f5de8d5d2f7573fe8c863c051225a27b61e6860fd047b1775807858", size = 1133660 }, + { url = "https://files.pythonhosted.org/packages/3d/85/c262db650e86812585e2bc59e497a8f59948a005325a11bbbc9ecd3fe26b/PyNaCl-1.5.0-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:a422368fc821589c228f4c49438a368831cb5bbc0eab5ebe1d7fac9dded6567b", size = 663824 }, + { url = "https://files.pythonhosted.org/packages/fd/1a/cc308a884bd299b651f1633acb978e8596c71c33ca85e9dc9fa33a5399b9/PyNaCl-1.5.0-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:61f642bf2378713e2c2e1de73444a3778e5f0a38be6fee0fe532fe30060282ff", size = 1117912 }, + { url = "https://files.pythonhosted.org/packages/25/2d/b7df6ddb0c2a33afdb358f8af6ea3b8c4d1196ca45497dd37a56f0c122be/PyNaCl-1.5.0-cp36-abi3-win32.whl", hash = "sha256:e46dae94e34b085175f8abb3b0aaa7da40767865ac82c928eeb9e57e1ea8a543", size = 204624 }, + { url = 
"https://files.pythonhosted.org/packages/5e/22/d3db169895faaf3e2eda892f005f433a62db2decbcfbc2f61e6517adfa87/PyNaCl-1.5.0-cp36-abi3-win_amd64.whl", hash = "sha256:20f42270d27e1b6a29f54032090b972d97f0a1b0948cc52392041ef7831fee93", size = 212141 }, +] + [[package]] name = "pytest" version = "8.3.4" @@ -1817,6 +2030,92 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166 }, ] +[[package]] +name = "wrapt" +version = "1.17.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c3/fc/e91cc220803d7bc4db93fb02facd8461c37364151b8494762cc88b0fbcef/wrapt-1.17.2.tar.gz", hash = "sha256:41388e9d4d1522446fe79d3213196bd9e3b301a336965b9e27ca2788ebd122f3", size = 55531 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/d1/1daec934997e8b160040c78d7b31789f19b122110a75eca3d4e8da0049e1/wrapt-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3d57c572081fed831ad2d26fd430d565b76aa277ed1d30ff4d40670b1c0dd984", size = 53307 }, + { url = "https://files.pythonhosted.org/packages/1b/7b/13369d42651b809389c1a7153baa01d9700430576c81a2f5c5e460df0ed9/wrapt-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b5e251054542ae57ac7f3fba5d10bfff615b6c2fb09abeb37d2f1463f841ae22", size = 38486 }, + { url = "https://files.pythonhosted.org/packages/62/bf/e0105016f907c30b4bd9e377867c48c34dc9c6c0c104556c9c9126bd89ed/wrapt-1.17.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:80dd7db6a7cb57ffbc279c4394246414ec99537ae81ffd702443335a61dbf3a7", size = 38777 }, + { url = "https://files.pythonhosted.org/packages/27/70/0f6e0679845cbf8b165e027d43402a55494779295c4b08414097b258ac87/wrapt-1.17.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:0a6e821770cf99cc586d33833b2ff32faebdbe886bd6322395606cf55153246c", size = 83314 }, + { url = "https://files.pythonhosted.org/packages/0f/77/0576d841bf84af8579124a93d216f55d6f74374e4445264cb378a6ed33eb/wrapt-1.17.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b60fb58b90c6d63779cb0c0c54eeb38941bae3ecf7a73c764c52c88c2dcb9d72", size = 74947 }, + { url = "https://files.pythonhosted.org/packages/90/ec/00759565518f268ed707dcc40f7eeec38637d46b098a1f5143bff488fe97/wrapt-1.17.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b870b5df5b71d8c3359d21be8f0d6c485fa0ebdb6477dda51a1ea54a9b558061", size = 82778 }, + { url = "https://files.pythonhosted.org/packages/f8/5a/7cffd26b1c607b0b0c8a9ca9d75757ad7620c9c0a9b4a25d3f8a1480fafc/wrapt-1.17.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4011d137b9955791f9084749cba9a367c68d50ab8d11d64c50ba1688c9b457f2", size = 81716 }, + { url = "https://files.pythonhosted.org/packages/7e/09/dccf68fa98e862df7e6a60a61d43d644b7d095a5fc36dbb591bbd4a1c7b2/wrapt-1.17.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:1473400e5b2733e58b396a04eb7f35f541e1fb976d0c0724d0223dd607e0f74c", size = 74548 }, + { url = "https://files.pythonhosted.org/packages/b7/8e/067021fa3c8814952c5e228d916963c1115b983e21393289de15128e867e/wrapt-1.17.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3cedbfa9c940fdad3e6e941db7138e26ce8aad38ab5fe9dcfadfed9db7a54e62", size = 81334 }, + { url = "https://files.pythonhosted.org/packages/4b/0d/9d4b5219ae4393f718699ca1c05f5ebc0c40d076f7e65fd48f5f693294fb/wrapt-1.17.2-cp310-cp310-win32.whl", hash = "sha256:582530701bff1dec6779efa00c516496968edd851fba224fbd86e46cc6b73563", size = 36427 }, + { url = "https://files.pythonhosted.org/packages/72/6a/c5a83e8f61aec1e1aeef939807602fb880e5872371e95df2137142f5c58e/wrapt-1.17.2-cp310-cp310-win_amd64.whl", hash = 
"sha256:58705da316756681ad3c9c73fd15499aa4d8c69f9fd38dc8a35e06c12468582f", size = 38774 }, + { url = "https://files.pythonhosted.org/packages/cd/f7/a2aab2cbc7a665efab072344a8949a71081eed1d2f451f7f7d2b966594a2/wrapt-1.17.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ff04ef6eec3eee8a5efef2401495967a916feaa353643defcc03fc74fe213b58", size = 53308 }, + { url = "https://files.pythonhosted.org/packages/50/ff/149aba8365fdacef52b31a258c4dc1c57c79759c335eff0b3316a2664a64/wrapt-1.17.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4db983e7bca53819efdbd64590ee96c9213894272c776966ca6306b73e4affda", size = 38488 }, + { url = "https://files.pythonhosted.org/packages/65/46/5a917ce85b5c3b490d35c02bf71aedaa9f2f63f2d15d9949cc4ba56e8ba9/wrapt-1.17.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9abc77a4ce4c6f2a3168ff34b1da9b0f311a8f1cfd694ec96b0603dff1c79438", size = 38776 }, + { url = "https://files.pythonhosted.org/packages/ca/74/336c918d2915a4943501c77566db41d1bd6e9f4dbc317f356b9a244dfe83/wrapt-1.17.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b929ac182f5ace000d459c59c2c9c33047e20e935f8e39371fa6e3b85d56f4a", size = 83776 }, + { url = "https://files.pythonhosted.org/packages/09/99/c0c844a5ccde0fe5761d4305485297f91d67cf2a1a824c5f282e661ec7ff/wrapt-1.17.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f09b286faeff3c750a879d336fb6d8713206fc97af3adc14def0cdd349df6000", size = 75420 }, + { url = "https://files.pythonhosted.org/packages/b4/b0/9fc566b0fe08b282c850063591a756057c3247b2362b9286429ec5bf1721/wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a7ed2d9d039bd41e889f6fb9364554052ca21ce823580f6a07c4ec245c1f5d6", size = 83199 }, + { url = 
"https://files.pythonhosted.org/packages/9d/4b/71996e62d543b0a0bd95dda485219856def3347e3e9380cc0d6cf10cfb2f/wrapt-1.17.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:129a150f5c445165ff941fc02ee27df65940fcb8a22a61828b1853c98763a64b", size = 82307 }, + { url = "https://files.pythonhosted.org/packages/39/35/0282c0d8789c0dc9bcc738911776c762a701f95cfe113fb8f0b40e45c2b9/wrapt-1.17.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1fb5699e4464afe5c7e65fa51d4f99e0b2eadcc176e4aa33600a3df7801d6662", size = 75025 }, + { url = "https://files.pythonhosted.org/packages/4f/6d/90c9fd2c3c6fee181feecb620d95105370198b6b98a0770cba090441a828/wrapt-1.17.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9a2bce789a5ea90e51a02dfcc39e31b7f1e662bc3317979aa7e5538e3a034f72", size = 81879 }, + { url = "https://files.pythonhosted.org/packages/8f/fa/9fb6e594f2ce03ef03eddbdb5f4f90acb1452221a5351116c7c4708ac865/wrapt-1.17.2-cp311-cp311-win32.whl", hash = "sha256:4afd5814270fdf6380616b321fd31435a462019d834f83c8611a0ce7484c7317", size = 36419 }, + { url = "https://files.pythonhosted.org/packages/47/f8/fb1773491a253cbc123c5d5dc15c86041f746ed30416535f2a8df1f4a392/wrapt-1.17.2-cp311-cp311-win_amd64.whl", hash = "sha256:acc130bc0375999da18e3d19e5a86403667ac0c4042a094fefb7eec8ebac7cf3", size = 38773 }, + { url = "https://files.pythonhosted.org/packages/a1/bd/ab55f849fd1f9a58ed7ea47f5559ff09741b25f00c191231f9f059c83949/wrapt-1.17.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d5e2439eecc762cd85e7bd37161d4714aa03a33c5ba884e26c81559817ca0925", size = 53799 }, + { url = "https://files.pythonhosted.org/packages/53/18/75ddc64c3f63988f5a1d7e10fb204ffe5762bc663f8023f18ecaf31a332e/wrapt-1.17.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fc7cb4c1c744f8c05cd5f9438a3caa6ab94ce8344e952d7c45a8ed59dd88392", size = 38821 }, + { url = 
"https://files.pythonhosted.org/packages/48/2a/97928387d6ed1c1ebbfd4efc4133a0633546bec8481a2dd5ec961313a1c7/wrapt-1.17.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8fdbdb757d5390f7c675e558fd3186d590973244fab0c5fe63d373ade3e99d40", size = 38919 }, + { url = "https://files.pythonhosted.org/packages/73/54/3bfe5a1febbbccb7a2f77de47b989c0b85ed3a6a41614b104204a788c20e/wrapt-1.17.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb1d0dbf99411f3d871deb6faa9aabb9d4e744d67dcaaa05399af89d847a91d", size = 88721 }, + { url = "https://files.pythonhosted.org/packages/25/cb/7262bc1b0300b4b64af50c2720ef958c2c1917525238d661c3e9a2b71b7b/wrapt-1.17.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d18a4865f46b8579d44e4fe1e2bcbc6472ad83d98e22a26c963d46e4c125ef0b", size = 80899 }, + { url = "https://files.pythonhosted.org/packages/2a/5a/04cde32b07a7431d4ed0553a76fdb7a61270e78c5fd5a603e190ac389f14/wrapt-1.17.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc570b5f14a79734437cb7b0500376b6b791153314986074486e0b0fa8d71d98", size = 89222 }, + { url = "https://files.pythonhosted.org/packages/09/28/2e45a4f4771fcfb109e244d5dbe54259e970362a311b67a965555ba65026/wrapt-1.17.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6d9187b01bebc3875bac9b087948a2bccefe464a7d8f627cf6e48b1bbae30f82", size = 86707 }, + { url = "https://files.pythonhosted.org/packages/c6/d2/dcb56bf5f32fcd4bd9aacc77b50a539abdd5b6536872413fd3f428b21bed/wrapt-1.17.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:9e8659775f1adf02eb1e6f109751268e493c73716ca5761f8acb695e52a756ae", size = 79685 }, + { url = "https://files.pythonhosted.org/packages/80/4e/eb8b353e36711347893f502ce91c770b0b0929f8f0bed2670a6856e667a9/wrapt-1.17.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e8b2816ebef96d83657b56306152a93909a83f23994f4b30ad4573b00bd11bb9", size = 87567 }, 
+ { url = "https://files.pythonhosted.org/packages/17/27/4fe749a54e7fae6e7146f1c7d914d28ef599dacd4416566c055564080fe2/wrapt-1.17.2-cp312-cp312-win32.whl", hash = "sha256:468090021f391fe0056ad3e807e3d9034e0fd01adcd3bdfba977b6fdf4213ea9", size = 36672 }, + { url = "https://files.pythonhosted.org/packages/15/06/1dbf478ea45c03e78a6a8c4be4fdc3c3bddea5c8de8a93bc971415e47f0f/wrapt-1.17.2-cp312-cp312-win_amd64.whl", hash = "sha256:ec89ed91f2fa8e3f52ae53cd3cf640d6feff92ba90d62236a81e4e563ac0e991", size = 38865 }, + { url = "https://files.pythonhosted.org/packages/ce/b9/0ffd557a92f3b11d4c5d5e0c5e4ad057bd9eb8586615cdaf901409920b14/wrapt-1.17.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6ed6ffac43aecfe6d86ec5b74b06a5be33d5bb9243d055141e8cabb12aa08125", size = 53800 }, + { url = "https://files.pythonhosted.org/packages/c0/ef/8be90a0b7e73c32e550c73cfb2fa09db62234227ece47b0e80a05073b375/wrapt-1.17.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:35621ae4c00e056adb0009f8e86e28eb4a41a4bfa8f9bfa9fca7d343fe94f998", size = 38824 }, + { url = "https://files.pythonhosted.org/packages/36/89/0aae34c10fe524cce30fe5fc433210376bce94cf74d05b0d68344c8ba46e/wrapt-1.17.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a604bf7a053f8362d27eb9fefd2097f82600b856d5abe996d623babd067b1ab5", size = 38920 }, + { url = "https://files.pythonhosted.org/packages/3b/24/11c4510de906d77e0cfb5197f1b1445d4fec42c9a39ea853d482698ac681/wrapt-1.17.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cbabee4f083b6b4cd282f5b817a867cf0b1028c54d445b7ec7cfe6505057cf8", size = 88690 }, + { url = "https://files.pythonhosted.org/packages/71/d7/cfcf842291267bf455b3e266c0c29dcb675b5540ee8b50ba1699abf3af45/wrapt-1.17.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:49703ce2ddc220df165bd2962f8e03b84c89fee2d65e1c24a7defff6f988f4d6", size = 80861 }, + { url = 
"https://files.pythonhosted.org/packages/d5/66/5d973e9f3e7370fd686fb47a9af3319418ed925c27d72ce16b791231576d/wrapt-1.17.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8112e52c5822fc4253f3901b676c55ddf288614dc7011634e2719718eaa187dc", size = 89174 }, + { url = "https://files.pythonhosted.org/packages/a7/d3/8e17bb70f6ae25dabc1aaf990f86824e4fd98ee9cadf197054e068500d27/wrapt-1.17.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fee687dce376205d9a494e9c121e27183b2a3df18037f89d69bd7b35bcf59e2", size = 86721 }, + { url = "https://files.pythonhosted.org/packages/6f/54/f170dfb278fe1c30d0ff864513cff526d624ab8de3254b20abb9cffedc24/wrapt-1.17.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:18983c537e04d11cf027fbb60a1e8dfd5190e2b60cc27bc0808e653e7b218d1b", size = 79763 }, + { url = "https://files.pythonhosted.org/packages/4a/98/de07243751f1c4a9b15c76019250210dd3486ce098c3d80d5f729cba029c/wrapt-1.17.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:703919b1633412ab54bcf920ab388735832fdcb9f9a00ae49387f0fe67dad504", size = 87585 }, + { url = "https://files.pythonhosted.org/packages/f9/f0/13925f4bd6548013038cdeb11ee2cbd4e37c30f8bfd5db9e5a2a370d6e20/wrapt-1.17.2-cp313-cp313-win32.whl", hash = "sha256:abbb9e76177c35d4e8568e58650aa6926040d6a9f6f03435b7a522bf1c487f9a", size = 36676 }, + { url = "https://files.pythonhosted.org/packages/bf/ae/743f16ef8c2e3628df3ddfd652b7d4c555d12c84b53f3d8218498f4ade9b/wrapt-1.17.2-cp313-cp313-win_amd64.whl", hash = "sha256:69606d7bb691b50a4240ce6b22ebb319c1cfb164e5f6569835058196e0f3a845", size = 38871 }, + { url = "https://files.pythonhosted.org/packages/3d/bc/30f903f891a82d402ffb5fda27ec1d621cc97cb74c16fea0b6141f1d4e87/wrapt-1.17.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:4a721d3c943dae44f8e243b380cb645a709ba5bd35d3ad27bc2ed947e9c68192", size = 56312 }, + { url = 
"https://files.pythonhosted.org/packages/8a/04/c97273eb491b5f1c918857cd26f314b74fc9b29224521f5b83f872253725/wrapt-1.17.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:766d8bbefcb9e00c3ac3b000d9acc51f1b399513f44d77dfe0eb026ad7c9a19b", size = 40062 }, + { url = "https://files.pythonhosted.org/packages/4e/ca/3b7afa1eae3a9e7fefe499db9b96813f41828b9fdb016ee836c4c379dadb/wrapt-1.17.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e496a8ce2c256da1eb98bd15803a79bee00fc351f5dfb9ea82594a3f058309e0", size = 40155 }, + { url = "https://files.pythonhosted.org/packages/89/be/7c1baed43290775cb9030c774bc53c860db140397047cc49aedaf0a15477/wrapt-1.17.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d615e4fe22f4ad3528448c193b218e077656ca9ccb22ce2cb20db730f8d306", size = 113471 }, + { url = "https://files.pythonhosted.org/packages/32/98/4ed894cf012b6d6aae5f5cc974006bdeb92f0241775addad3f8cd6ab71c8/wrapt-1.17.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a5aaeff38654462bc4b09023918b7f21790efb807f54c000a39d41d69cf552cb", size = 101208 }, + { url = "https://files.pythonhosted.org/packages/ea/fd/0c30f2301ca94e655e5e057012e83284ce8c545df7661a78d8bfca2fac7a/wrapt-1.17.2-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a7d15bbd2bc99e92e39f49a04653062ee6085c0e18b3b7512a4f2fe91f2d681", size = 109339 }, + { url = "https://files.pythonhosted.org/packages/75/56/05d000de894c4cfcb84bcd6b1df6214297b8089a7bd324c21a4765e49b14/wrapt-1.17.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e3890b508a23299083e065f435a492b5435eba6e304a7114d2f919d400888cc6", size = 110232 }, + { url = "https://files.pythonhosted.org/packages/53/f8/c3f6b2cf9b9277fb0813418e1503e68414cd036b3b099c823379c9575e6d/wrapt-1.17.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:8c8b293cd65ad716d13d8dd3624e42e5a19cc2a2f1acc74b30c2c13f15cb61a6", size 
= 100476 }, + { url = "https://files.pythonhosted.org/packages/a7/b1/0bb11e29aa5139d90b770ebbfa167267b1fc548d2302c30c8f7572851738/wrapt-1.17.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c82b8785d98cdd9fed4cac84d765d234ed3251bd6afe34cb7ac523cb93e8b4f", size = 106377 }, + { url = "https://files.pythonhosted.org/packages/6a/e1/0122853035b40b3f333bbb25f1939fc1045e21dd518f7f0922b60c156f7c/wrapt-1.17.2-cp313-cp313t-win32.whl", hash = "sha256:13e6afb7fe71fe7485a4550a8844cc9ffbe263c0f1a1eea569bc7091d4898555", size = 37986 }, + { url = "https://files.pythonhosted.org/packages/09/5e/1655cf481e079c1f22d0cabdd4e51733679932718dc23bf2db175f329b76/wrapt-1.17.2-cp313-cp313t-win_amd64.whl", hash = "sha256:eaf675418ed6b3b31c7a989fd007fa7c3be66ce14e5c3b27336383604c9da85c", size = 40750 }, + { url = "https://files.pythonhosted.org/packages/0c/66/95b9e90e6e1274999b183c9c3f984996d870e933ca9560115bd1cd1d6f77/wrapt-1.17.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5c803c401ea1c1c18de70a06a6f79fcc9c5acfc79133e9869e730ad7f8ad8ef9", size = 53234 }, + { url = "https://files.pythonhosted.org/packages/a4/b6/6eced5e2db5924bf6d9223d2bb96b62e00395aae77058e6a9e11bf16b3bd/wrapt-1.17.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f917c1180fdb8623c2b75a99192f4025e412597c50b2ac870f156de8fb101119", size = 38462 }, + { url = "https://files.pythonhosted.org/packages/5d/a4/c8472fe2568978b5532df84273c53ddf713f689d408a4335717ab89547e0/wrapt-1.17.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ecc840861360ba9d176d413a5489b9a0aff6d6303d7e733e2c4623cfa26904a6", size = 38730 }, + { url = "https://files.pythonhosted.org/packages/3c/70/1d259c6b1ad164eb23ff70e3e452dd1950f96e6473f72b7207891d0fd1f0/wrapt-1.17.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb87745b2e6dc56361bfde481d5a378dc314b252a98d7dd19a651a3fa58f24a9", size = 86225 }, + { url = 
"https://files.pythonhosted.org/packages/a9/68/6b83367e1afb8de91cbea4ef8e85b58acdf62f034f05d78c7b82afaa23d8/wrapt-1.17.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58455b79ec2661c3600e65c0a716955adc2410f7383755d537584b0de41b1d8a", size = 78055 }, + { url = "https://files.pythonhosted.org/packages/0d/21/09573d2443916705c57fdab85d508f592c0a58d57becc53e15755d67fba2/wrapt-1.17.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4e42a40a5e164cbfdb7b386c966a588b1047558a990981ace551ed7e12ca9c2", size = 85592 }, + { url = "https://files.pythonhosted.org/packages/45/ce/700e17a852dd5dec894e241c72973ea82363486bcc1fb05d47b4fbd1d683/wrapt-1.17.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:91bd7d1773e64019f9288b7a5101f3ae50d3d8e6b1de7edee9c2ccc1d32f0c0a", size = 83906 }, + { url = "https://files.pythonhosted.org/packages/37/14/bd210faf0a66faeb8529d42b6b45a25d6aa6ce25ddfc19168e4161aed227/wrapt-1.17.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:bb90fb8bda722a1b9d48ac1e6c38f923ea757b3baf8ebd0c82e09c5c1a0e7a04", size = 76763 }, + { url = "https://files.pythonhosted.org/packages/34/0c/85af70d291f44659c422416f0272046109e785bf6db8c081cfeeae5715c5/wrapt-1.17.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:08e7ce672e35efa54c5024936e559469436f8b8096253404faeb54d2a878416f", size = 83573 }, + { url = "https://files.pythonhosted.org/packages/f8/1e/b215068e824878f69ea945804fa26c176f7c2735a3ad5367d78930bd076a/wrapt-1.17.2-cp38-cp38-win32.whl", hash = "sha256:410a92fefd2e0e10d26210e1dfb4a876ddaf8439ef60d6434f21ef8d87efc5b7", size = 36408 }, + { url = "https://files.pythonhosted.org/packages/52/27/3dd9ad5f1097b33c95d05929e409cc86d7c765cb5437b86694dc8f8e9af0/wrapt-1.17.2-cp38-cp38-win_amd64.whl", hash = "sha256:95c658736ec15602da0ed73f312d410117723914a5c91a14ee4cdd72f1d790b3", size = 38737 }, + { url = 
"https://files.pythonhosted.org/packages/8a/f4/6ed2b8f6f1c832933283974839b88ec7c983fd12905e01e97889dadf7559/wrapt-1.17.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:99039fa9e6306880572915728d7f6c24a86ec57b0a83f6b2491e1d8ab0235b9a", size = 53308 }, + { url = "https://files.pythonhosted.org/packages/a2/a9/712a53f8f4f4545768ac532619f6e56d5d0364a87b2212531685e89aeef8/wrapt-1.17.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2696993ee1eebd20b8e4ee4356483c4cb696066ddc24bd70bcbb80fa56ff9061", size = 38489 }, + { url = "https://files.pythonhosted.org/packages/fa/9b/e172c8f28a489a2888df18f953e2f6cb8d33b1a2e78c9dfc52d8bf6a5ead/wrapt-1.17.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:612dff5db80beef9e649c6d803a8d50c409082f1fedc9dbcdfde2983b2025b82", size = 38776 }, + { url = "https://files.pythonhosted.org/packages/cf/cb/7a07b51762dcd59bdbe07aa97f87b3169766cadf240f48d1cbe70a1be9db/wrapt-1.17.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:62c2caa1585c82b3f7a7ab56afef7b3602021d6da34fbc1cf234ff139fed3cd9", size = 83050 }, + { url = "https://files.pythonhosted.org/packages/a5/51/a42757dd41032afd6d8037617aa3bc6803ba971850733b24dfb7d5c627c4/wrapt-1.17.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c958bcfd59bacc2d0249dcfe575e71da54f9dcf4a8bdf89c4cb9a68a1170d73f", size = 74718 }, + { url = "https://files.pythonhosted.org/packages/bf/bb/d552bfe47db02fcfc950fc563073a33500f8108efa5f7b41db2f83a59028/wrapt-1.17.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc78a84e2dfbc27afe4b2bd7c80c8db9bca75cc5b85df52bfe634596a1da846b", size = 82590 }, + { url = "https://files.pythonhosted.org/packages/77/99/77b06b3c3c410dbae411105bf22496facf03a5496bfaca8fbcf9da381889/wrapt-1.17.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ba0f0eb61ef00ea10e00eb53a9129501f52385c44853dbd6c4ad3f403603083f", size = 81462 }, + { url = 
"https://files.pythonhosted.org/packages/2d/21/cf0bd85ae66f92600829ea1de8e1da778e5e9f6e574ccbe74b66db0d95db/wrapt-1.17.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1e1fe0e6ab7775fd842bc39e86f6dcfc4507ab0ffe206093e76d61cde37225c8", size = 74309 }, + { url = "https://files.pythonhosted.org/packages/6d/16/112d25e9092398a0dd6fec50ab7ac1b775a0c19b428f049785096067ada9/wrapt-1.17.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c86563182421896d73858e08e1db93afdd2b947a70064b813d515d66549e15f9", size = 81081 }, + { url = "https://files.pythonhosted.org/packages/2b/49/364a615a0cc0872685646c495c7172e4fc7bf1959e3b12a1807a03014e05/wrapt-1.17.2-cp39-cp39-win32.whl", hash = "sha256:f393cda562f79828f38a819f4788641ac7c4085f30f1ce1a68672baa686482bb", size = 36423 }, + { url = "https://files.pythonhosted.org/packages/00/ad/5d2c1b34ba3202cd833d9221833e74d6500ce66730974993a8dc9a94fb8c/wrapt-1.17.2-cp39-cp39-win_amd64.whl", hash = "sha256:36ccae62f64235cf8ddb682073a60519426fdd4725524ae38874adf72b5f2aeb", size = 38772 }, + { url = "https://files.pythonhosted.org/packages/2d/82/f56956041adef78f849db6b289b282e72b55ab8045a75abad81898c28d19/wrapt-1.17.2-py3-none-any.whl", hash = "sha256:b18f2d1533a71f069c7f82d524a52599053d4c7166e9dd374ae2136b7f40f7c8", size = 23594 }, +] + [[package]] name = "zipp" version = "3.20.2" From d635d56ecdc0cf2667c01cfcc51f26733ec796dc Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Fri, 7 Feb 2025 14:36:54 -0500 Subject: [PATCH 096/248] Chore/upgrade datafusion 45 (#1010) * upgrade dep * resolve errors * match new pyo3 version * upgrade dep * back to 43 * use released v45 * remove unnecessary pyarrow feature * Update unit test return type * fix test_relational_expr --------- Co-authored-by: Tim Saucer --- Cargo.lock | 400 +++++++++++--------- Cargo.toml | 16 +- examples/ffi-table-provider/Cargo.lock | 493 ++++++++++++++----------- examples/ffi-table-provider/Cargo.toml | 14 +- examples/ffi-table-provider/src/lib.rs | 2 +- python/tests/test_expr.py | 3 
+- python/tests/test_functions.py | 2 +- src/context.rs | 2 +- src/dataframe.rs | 2 +- 9 files changed, 514 insertions(+), 420 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 50809696b..f1b1ed50a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -179,9 +179,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "53.4.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eaf3437355979f1e93ba84ba108c38be5767713051f3c8ffbf07c094e2e61f9f" +checksum = "6422e12ac345a0678d7a17e316238e3a40547ae7f92052b77bd86d5e0239f3fc" dependencies = [ "arrow-arith", "arrow-array", @@ -201,24 +201,23 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "53.4.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31dce77d2985522288edae7206bffd5fc4996491841dda01a13a58415867e681" +checksum = "23cf34bb1f48c41d3475927bcc7be498665b8e80b379b88f62a840337f8b8248" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "half", "num", ] [[package]] name = "arrow-array" -version = "53.4.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d45fe6d3faed0435b7313e59a02583b14c6c6339fa7729e94c32a20af319a79" +checksum = "fb4a06d507f54b70a277be22a127c8ffe0cec6cd98c0ad8a48e77779bbda8223" dependencies = [ "ahash", "arrow-buffer", @@ -233,9 +232,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "53.4.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b02656a35cc103f28084bc80a0159668e0a680d919cef127bd7e0aaccb06ec1" +checksum = "d69d326d5ad1cb82dcefa9ede3fee8fdca98f9982756b16f9cb142f4aa6edc89" dependencies = [ "bytes", "half", @@ -244,9 +243,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "53.4.0" +version = "54.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c73c6233c5b5d635a56f6010e6eb1ab9e30e94707db21cea03da317f67d84cf3" +checksum = "626e65bd42636a84a238bed49d09c8777e3d825bf81f5087a70111c2831d9870" dependencies = [ "arrow-array", "arrow-buffer", @@ -265,28 +264,25 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "53.4.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec222848d70fea5a32af9c3602b08f5d740d5e2d33fbd76bf6fd88759b5b13a7" +checksum = "71c8f959f7a1389b1dbd883cdcd37c3ed12475329c111912f7f69dad8195d8c6" dependencies = [ "arrow-array", - "arrow-buffer", "arrow-cast", - "arrow-data", "arrow-schema", "chrono", "csv", "csv-core", "lazy_static", - "lexical-core", "regex", ] [[package]] name = "arrow-data" -version = "53.4.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7f2861ffa86f107b8ab577d86cff7c7a490243eabe961ba1e1af4f27542bb79" +checksum = "1858e7c7d01c44cf71c21a85534fd1a54501e8d60d1195d0d6fbcc00f4b10754" dependencies = [ "arrow-buffer", "arrow-schema", @@ -296,13 +292,12 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "53.4.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0270dc511f11bb5fa98a25020ad51a99ca5b08d8a8dfbd17503bb9dba0388f0b" +checksum = "a6bb3f727f049884c7603f0364bc9315363f356b59e9f605ea76541847e06a1e" dependencies = [ "arrow-array", "arrow-buffer", - "arrow-cast", "arrow-data", "arrow-schema", "flatbuffers", @@ -311,9 +306,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "53.4.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0eff38eeb8a971ad3a4caf62c5d57f0cff8a48b64a55e3207c4fd696a9234aad" +checksum = "35de94f165ed8830aede72c35f238763794f0d49c69d30c44d49c9834267ff8c" dependencies = [ "arrow-array", "arrow-buffer", @@ -331,26 +326,23 @@ dependencies = [ [[package]] name = "arrow-ord" 
-version = "53.4.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6f202a879d287099139ff0d121e7f55ae5e0efe634b8cf2106ebc27a8715dee" +checksum = "8aa06e5f267dc53efbacb933485c79b6fc1685d3ffbe870a16ce4e696fb429da" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", - "half", - "num", ] [[package]] name = "arrow-row" -version = "53.4.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f936954991c360ba762dff23f5dda16300774fafd722353d9683abd97630ae" +checksum = "66f1144bb456a2f9d82677bd3abcea019217e572fc8f07de5a7bac4b2c56eb2c" dependencies = [ - "ahash", "arrow-array", "arrow-buffer", "arrow-data", @@ -360,18 +352,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.4.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9579b9d8bce47aa41389fe344f2c6758279983b7c0ebb4013e283e3e91bb450e" +checksum = "105f01ec0090259e9a33a9263ec18ff223ab91a0ea9fbc18042f7e38005142f6" dependencies = [ "bitflags 2.8.0", ] [[package]] name = "arrow-select" -version = "53.4.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7471ba126d0b0aaa24b50a36bc6c25e4e74869a1fd1a5553357027a0b1c8d1f1" +checksum = "f690752fdbd2dee278b5f1636fefad8f2f7134c85e20fd59c4199e15a39a6807" dependencies = [ "ahash", "arrow-array", @@ -383,9 +375,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "53.4.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72993b01cb62507b06f1fb49648d7286c8989ecfabdb7b77a750fcb54410731b" +checksum = "d0fff9cd745a7039b66c47ecaf5954460f9fa12eed628f65170117ea93e64ee0" dependencies = [ "arrow-array", "arrow-buffer", @@ -444,7 +436,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 
2.0.98", ] [[package]] @@ -455,7 +447,7 @@ checksum = "644dd749086bf3771a2fbc5f256fdb982d53f011c7d5d560304eafeecebce79d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -598,9 +590,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" +checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9" [[package]] name = "bzip2" @@ -635,9 +627,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.11" +version = "1.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4730490333d58093109dc02c23174c3f4d490998c3fed3cc8e82d57afedb9cf" +checksum = "755717a7de9ec452bf7f3f1a3099085deabd7f2962b861dae91ecd7a365903d2" dependencies = [ "jobserver", "libc", @@ -874,9 +866,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "014fc8c384ecacedaabb3bc8359c2a6c6e9d8f7bea65be3434eccacfc37f52d9" +checksum = "eae420e7a5b0b7f1c39364cc76cbcd0f5fdc416b2514ae3847c2676bbd60702a" dependencies = [ "apache-avro", "arrow", @@ -888,7 +880,6 @@ dependencies = [ "bytes", "bzip2 0.5.0", "chrono", - "dashmap", "datafusion-catalog", "datafusion-common", "datafusion-common-runtime", @@ -908,7 +899,7 @@ dependencies = [ "flate2", "futures", "glob", - "itertools", + "itertools 0.14.0", "log", "num-traits", "object_store", @@ -928,31 +919,39 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee60d33e210ef96070377ae667ece7caa0e959c8387496773d4a1a72f1a5012e" +checksum = "6f27987bc22b810939e8dfecc55571e9d50355d6ea8ec1c47af8383a76a6d0e1" 
dependencies = [ - "arrow-schema", + "arrow", "async-trait", + "dashmap", "datafusion-common", "datafusion-execution", "datafusion-expr", "datafusion-physical-plan", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", "parking_lot", + "sqlparser", ] [[package]] name = "datafusion-common" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b42b7d720fe21ed9cca2ebb635f3f13a12cfab786b41e0fba184fb2e620525b" +checksum = "e3f6d5b8c9408cc692f7c194b8aa0c0f9b253e065a8d960ad9cdc2a13e697602" dependencies = [ "ahash", "apache-avro", "arrow", "arrow-array", "arrow-buffer", + "arrow-ipc", "arrow-schema", + "base64 0.22.1", "half", "hashbrown 0.14.5", "indexmap", @@ -969,9 +968,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72fbf14d4079f7ce5306393084fe5057dddfdc2113577e0049310afa12e94281" +checksum = "0d4603c8e8a4baf77660ab7074cc66fc15cc8a18f2ce9dfadb755fc6ee294e48" dependencies = [ "log", "tokio", @@ -979,15 +978,15 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c278dbd64860ed0bb5240fc1f4cb6aeea437153910aea69bcf7d5a8d6d0454f3" +checksum = "e5bf4bc68623a5cf231eed601ed6eb41f46a37c4d15d11a0bff24cbc8396cd66" [[package]] name = "datafusion-execution" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e22cb02af47e756468b3cbfee7a83e3d4f2278d452deb4b033ba933c75169486" +checksum = "88b491c012cdf8e051053426013429a76f74ee3c2db68496c79c323ca1084d27" dependencies = [ "arrow", "dashmap", @@ -1004,9 +1003,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"62298eadb1d15b525df1315e61a71519ffc563d41d5c3b2a30fda2d70f77b93c" +checksum = "e5a181408d4fc5dc22f9252781a8f39f2d0e5d1b33ec9bde242844980a2689c1" dependencies = [ "arrow", "chrono", @@ -1025,23 +1024,26 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dda7f73c5fc349251cd3dcb05773c5bf55d2505a698ef9d38dfc712161ea2f55" +checksum = "d1129b48e8534d8c03c6543bcdccef0b55c8ac0c1272a15a56c67068b6eb1885" dependencies = [ "arrow", "datafusion-common", - "itertools", + "itertools 0.14.0", + "paste", ] [[package]] name = "datafusion-ffi" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "114e944790756b84c2cc5971eae24f5430980149345601939ac222885d4db5f7" +checksum = "ff47a79d442207c168c6e3e1d970c248589c148e4800e5b285ac1b2cb1a230f8" dependencies = [ "abi_stable", "arrow", + "arrow-array", + "arrow-schema", "async-ffi", "async-trait", "datafusion", @@ -1049,13 +1051,15 @@ dependencies = [ "futures", "log", "prost", + "semver", + "tokio", ] [[package]] name = "datafusion-functions" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd197f3b2975424d3a4898ea46651be855a46721a56727515dbd5c9e2fb597da" +checksum = "6125874e4856dfb09b59886784fcb74cde5cfc5930b3a80a1a728ef7a010df6b" dependencies = [ "arrow", "arrow-buffer", @@ -1071,7 +1075,7 @@ dependencies = [ "datafusion-macros", "hashbrown 0.14.5", "hex", - "itertools", + "itertools 0.14.0", "log", "md-5", "rand", @@ -1083,12 +1087,13 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aabbe48fba18f9981b134124381bee9e46f93518b8ad2f9721ee296cef5affb9" +checksum = "f3add7b1d3888e05e7c95f2b281af900ca69ebdcb21069ba679b33bde8b3b9d6" dependencies 
= [ "ahash", "arrow", + "arrow-buffer", "arrow-schema", "datafusion-common", "datafusion-doc", @@ -1105,9 +1110,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7a3fefed9c8c11268d446d924baca8cabf52fe32f73fdaa20854bac6473590c" +checksum = "6e18baa4cfc3d2f144f74148ed68a1f92337f5072b6dde204a0dbbdf3324989c" dependencies = [ "ahash", "arrow", @@ -1118,9 +1123,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6360f27464fab857bec698af39b2ae331dc07c8bf008fb4de387a19cdc6815a5" +checksum = "3ec5ee8cecb0dc370291279673097ddabec03a011f73f30d7f1096457127e03e" dependencies = [ "arrow", "arrow-array", @@ -1128,21 +1133,23 @@ dependencies = [ "arrow-ord", "arrow-schema", "datafusion-common", + "datafusion-doc", "datafusion-execution", "datafusion-expr", "datafusion-functions", "datafusion-functions-aggregate", + "datafusion-macros", "datafusion-physical-expr-common", - "itertools", + "itertools 0.14.0", "log", "paste", ] [[package]] name = "datafusion-functions-table" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c35c070eb705c12795dab399c3809f4dfbc290678c624d3989490ca9b8449c1" +checksum = "2c403ddd473bbb0952ba880008428b3c7febf0ed3ce1eec35a205db20efb2a36" dependencies = [ "arrow", "async-trait", @@ -1156,9 +1163,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52229bca26b590b140900752226c829f15fc1a99840e1ca3ce1a9534690b82a8" +checksum = "1ab18c2fb835614d06a75f24a9e09136d3a8c12a92d97c95a6af316a1787a9c5" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1173,9 +1180,9 @@ 
dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "367befc303b64a668a10ae6988a064a9289e1999e71a7f8e526b6e14d6bdd9d6" +checksum = "a77b73bc15e7d1967121fdc7a55d819bfb9d6c03766a6c322247dce9094a53a4" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1183,19 +1190,20 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5de3c8f386ea991696553afe241a326ecbc3c98a12c562867e4be754d3a060c" +checksum = "09369b8d962291e808977cf94d495fd8b5b38647232d7ef562c27ac0f495b0af" dependencies = [ + "datafusion-expr", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] name = "datafusion-optimizer" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53b520413906f755910422b016fb73884ae6e9e1b376de4f9584b6c0e031da75" +checksum = "2403a7e4a84637f3de7d8d4d7a9ccc0cc4be92d89b0161ba3ee5be82f0531c54" dependencies = [ "arrow", "chrono", @@ -1203,7 +1211,7 @@ dependencies = [ "datafusion-expr", "datafusion-physical-expr", "indexmap", - "itertools", + "itertools 0.14.0", "log", "recursive", "regex", @@ -1212,9 +1220,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acd6ddc378f6ad19af95ccd6790dec8f8e1264bc4c70e99ddc1830c1a1c78ccd" +checksum = "86ff72ac702b62dbf2650c4e1d715ebd3e4aab14e3885e72e8549e250307347c" dependencies = [ "ahash", "arrow", @@ -1229,48 +1237,54 @@ dependencies = [ "half", "hashbrown 0.14.5", "indexmap", - "itertools", + "itertools 0.14.0", "log", "paste", - "petgraph", + "petgraph 0.7.1", ] [[package]] name = "datafusion-physical-expr-common" -version = "44.0.0" +version = "45.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "06e6c05458eccd74b4c77ed6a1fe63d52434240711de7f6960034794dad1caf5" +checksum = "60982b7d684e25579ee29754b4333057ed62e2cc925383c5f0bd8cab7962f435" dependencies = [ "ahash", "arrow", + "arrow-buffer", "datafusion-common", "datafusion-expr-common", "hashbrown 0.14.5", - "itertools", + "itertools 0.14.0", ] [[package]] name = "datafusion-physical-optimizer" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dc3a82190f49c37d377f31317e07ab5d7588b837adadba8ac367baad5dc2351" +checksum = "ac5e85c189d5238a5cf181a624e450c4cd4c66ac77ca551d6f3ff9080bac90bb" dependencies = [ "arrow", + "arrow-schema", "datafusion-common", "datafusion-execution", + "datafusion-expr", "datafusion-expr-common", "datafusion-physical-expr", + "datafusion-physical-expr-common", "datafusion-physical-plan", - "itertools", + "futures", + "itertools 0.14.0", "log", "recursive", + "url", ] [[package]] name = "datafusion-physical-plan" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a6608bc9844b4ddb5ed4e687d173e6c88700b1d0482f43894617d18a1fe75da" +checksum = "c36bf163956d7e2542657c78b3383fdc78f791317ef358a359feffcdb968106f" dependencies = [ "ahash", "arrow", @@ -1291,7 +1305,7 @@ dependencies = [ "half", "hashbrown 0.14.5", "indexmap", - "itertools", + "itertools 0.14.0", "log", "parking_lot", "pin-project-lite", @@ -1300,9 +1314,9 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e23b0998195e495bfa7b37cdceb317129a6c40522219f6872d2e0c9ae9f4fcb" +checksum = "2db5d79f0c974041787b899d24dc91bdab2ff112d1942dd71356a4ce3b407e6c" dependencies = [ "arrow", "chrono", @@ -1316,9 +1330,9 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "44.0.0" +version = "45.0.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfc59992a29eed2d2c1dd779deac99083b217774ebcf90ee121840607a4d866f" +checksum = "de21bde1603aac0ff32cf478e47081be6e3583c6861fe8f57034da911efe7578" dependencies = [ "arrow", "datafusion-common", @@ -1350,9 +1364,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a884061c79b33d0c8e84a6f4f4be8bdc12c0f53f5af28ddf5d6d95ac0b15fdc" +checksum = "e13caa4daede211ecec53c78b13c503b592794d125f9a3cc3afe992edf9e7f43" dependencies = [ "arrow", "arrow-array", @@ -1369,16 +1383,16 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "44.0.0" +version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2ec36dd38512b1ecc7a3bb92e72046b944611b2f0d709445c1e51b0143bffd4" +checksum = "1634405abd8bd3c64c352f2da2f2aec6d80a815930257e0db0ce4ff5daf00944" dependencies = [ "arrow-buffer", "async-recursion", "async-trait", "chrono", "datafusion", - "itertools", + "itertools 0.14.0", "object_store", "pbjson-types", "prost", @@ -1405,7 +1419,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1448,6 +1462,12 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + [[package]] name = "flatbuffers" version = "24.12.23" @@ -1545,7 +1565,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1938,7 +1958,7 @@ 
checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1999,6 +2019,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.14" @@ -2362,7 +2391,7 @@ dependencies = [ "httparse", "humantime", "hyper", - "itertools", + "itertools 0.13.0", "md-5", "parking_lot", "percent-encoding", @@ -2382,9 +2411,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.20.2" +version = "1.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" +checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" [[package]] name = "openssl-probe" @@ -2426,9 +2455,9 @@ dependencies = [ [[package]] name = "parquet" -version = "53.4.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8957c0c95a6a1804f3e51a18f69df29be53856a8c5768cc9b6d00fcafcd2917c" +checksum = "8a01a0efa30bbd601ae85b375c728efdb211ade54390281628a7b16708beb235" dependencies = [ "ahash", "arrow-array", @@ -2452,6 +2481,7 @@ dependencies = [ "object_store", "paste", "seq-macro", + "simdutf8", "snap", "thrift", "tokio", @@ -2492,7 +2522,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6eea3058763d6e656105d1403cb04e0a41b7bbac6362d413e7c33be0c32279c9" dependencies = [ "heck", - "itertools", + "itertools 0.13.0", "prost", "prost-types", ] @@ -2524,7 +2554,17 @@ version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" dependencies = [ - "fixedbitset", + "fixedbitset 0.4.2", 
+ "indexmap", +] + +[[package]] +name = "petgraph" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +dependencies = [ + "fixedbitset 0.5.7", "indexmap", ] @@ -2606,7 +2646,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6924ced06e1f7dfe3fa48d57b9f74f55d8915f5036121bef647ef4b204895fac" dependencies = [ "proc-macro2", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -2635,16 +2675,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0f3e5beed80eb580c68e2c600937ac2c4eedabdfd5ef1e5b7ea4f3fba84497b" dependencies = [ "heck", - "itertools", + "itertools 0.13.0", "log", "multimap", "once_cell", - "petgraph", + "petgraph 0.6.5", "prettyplease", "prost", "prost-types", "regex", - "syn 2.0.96", + "syn 2.0.98", "tempfile", ] @@ -2655,10 +2695,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "157c5a9d7ea5c2ed2d9fb8f495b64759f7816c7eaea54ba3978f0d63000162e3" dependencies = [ "anyhow", - "itertools", + "itertools 0.13.0", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -2690,9 +2730,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.22.6" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884" +checksum = "57fe09249128b3173d092de9523eaa75136bf7ba85e0d69eca241c7939c933cc" dependencies = [ "cfg-if", "indoc", @@ -2708,9 +2748,9 @@ dependencies = [ [[package]] name = "pyo3-async-runtimes" -version = "0.22.0" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2529f0be73ffd2be0cc43c013a640796558aa12d7ca0aab5cc14f375b4733031" +checksum = "977dc837525cfd22919ba6a831413854beb7c99a256c03bf8624ad707e45810e" dependencies = [ "futures", "once_cell", @@ -2721,9 +2761,9 @@ dependencies = [ 
[[package]] name = "pyo3-build-config" -version = "0.22.6" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38" +checksum = "1cd3927b5a78757a0d71aa9dff669f903b1eb64b54142a9bd9f757f8fde65fd7" dependencies = [ "once_cell", "target-lexicon", @@ -2731,9 +2771,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.22.6" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636" +checksum = "dab6bb2102bd8f991e7749f130a70d05dd557613e39ed2deeee8e9ca0c4d548d" dependencies = [ "libc", "pyo3-build-config", @@ -2741,27 +2781,27 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.22.6" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453" +checksum = "91871864b353fd5ffcb3f91f2f703a22a9797c91b9ab497b1acac7b07ae509c7" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] name = "pyo3-macros-backend" -version = "0.22.6" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe" +checksum = "43abc3b80bc20f3facd86cd3c60beed58c3e2aa26213f3cda368de39c60a27e4" dependencies = [ "heck", "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -2888,7 +2928,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3029,9 +3069,9 @@ checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustc-hash" -version = "2.1.0" +version = "2.1.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" [[package]] name = "rustc_version" @@ -3161,7 +3201,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3234,7 +3274,7 @@ checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3245,7 +3285,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3269,7 +3309,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3314,6 +3354,12 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "siphasher" version = "1.0.1" @@ -3353,7 +3399,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3396,7 +3442,7 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3440,14 +3486,14 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] name = "substrait" -version = "0.50.4" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1772d041c37cc7e6477733c76b2acf4ee36bd52b2ae4d9ea0ec9c87d003db32" +checksum = 
"5db15789cecbfdf6b1fcf2db807e767c92273bdc407ac057c2194b070c597756" dependencies = [ "heck", "pbjson", @@ -3464,7 +3510,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.96", + "syn 2.0.98", "typify", "walkdir", ] @@ -3488,9 +3534,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.96" +version = "2.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" +checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" dependencies = [ "proc-macro2", "quote", @@ -3514,7 +3560,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3563,7 +3609,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3574,7 +3620,7 @@ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3646,7 +3692,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3718,7 +3764,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3784,7 +3830,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3795,9 +3841,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "typify" -version = "0.2.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b4c644dda9862f0fef3a570d8ddb3c2cfb1d5ac824a1f2ddfa7bc8f071a5ad8a" +checksum = "e03ba3643450cfd95a1aca2e1938fef63c1c1994489337998aff4ad771f21ef8" dependencies = [ "typify-impl", "typify-macro", @@ -3805,9 +3851,9 @@ dependencies = [ [[package]] name = "typify-impl" -version = "0.2.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d59ab345b6c0d8ae9500b9ff334a4c7c0d316c1c628dc55726b95887eb8dbd11" +checksum = "bce48219a2f3154aaa2c56cbf027728b24a3c8fe0a47ed6399781de2b3f3eeaf" dependencies = [ "heck", "log", @@ -3818,16 +3864,16 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.96", - "thiserror 1.0.69", + "syn 2.0.98", + "thiserror 2.0.11", "unicode-ident", ] [[package]] name = "typify-macro" -version = "0.2.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "785e2cdcef0df8160fdd762ed548a637aaec1e83704fdbc14da0df66013ee8d0" +checksum = "68b5780d745920ed73c5b7447496a9b5c42ed2681a9b70859377aec423ecf02b" dependencies = [ "proc-macro2", "quote", @@ -3836,7 +3882,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.96", + "syn 2.0.98", "typify-impl", ] @@ -3901,11 +3947,11 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.12.1" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3758f5e68192bb96cc8f9b7e2c2cfdabb435499a28499a42f8f984092adad4b" +checksum = "ced87ca4be083373936a67f8de945faa23b6b42384bd5b64434850802c6dccd0" dependencies = [ - "getrandom 0.2.15", + "getrandom 0.3.1", "serde", ] @@ -3971,7 +4017,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", "wasm-bindgen-shared", ] @@ -4006,7 +4052,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", "wasm-bindgen-backend", 
"wasm-bindgen-shared", ] @@ -4255,7 +4301,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", "synstructure", ] @@ -4277,7 +4323,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4297,7 +4343,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", "synstructure", ] @@ -4326,7 +4372,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 44e6e2244..d18e0e8f0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,13 +35,13 @@ substrait = ["dep:datafusion-substrait"] [dependencies] tokio = { version = "1.42", features = ["macros", "rt", "rt-multi-thread", "sync"] } -pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] } -pyo3-async-runtimes = { version = "0.22", features = ["tokio-runtime"]} -arrow = { version = "53", features = ["pyarrow"] } -datafusion = { version = "44.0.0", features = ["avro", "unicode_expressions"] } -datafusion-substrait = { version = "44.0.0", optional = true } -datafusion-proto = { version = "44.0.0" } -datafusion-ffi = { version = "44.0.0" } +pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py38"] } +pyo3-async-runtimes = { version = "0.23", features = ["tokio-runtime"]} +arrow = { version = "54", features = ["pyarrow"] } +datafusion = { version = "45.0.0", features = ["avro", "unicode_expressions"] } +datafusion-substrait = { version = "45.0.0", optional = true } +datafusion-proto = { version = "45.0.0" } +datafusion-ffi = { version = "45.0.0" } prost = "0.13" # keep in line with `datafusion-substrait` uuid = { version = 
"1.12", features = ["v4"] } mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] } @@ -52,7 +52,7 @@ url = "2" [build-dependencies] prost-types = "0.13" # keep in line with `datafusion-substrait` -pyo3-build-config = "0.22" +pyo3-build-config = "0.23" [lib] name = "datafusion_python" diff --git a/examples/ffi-table-provider/Cargo.lock b/examples/ffi-table-provider/Cargo.lock index 3b57cac75..32af85180 100644 --- a/examples/ffi-table-provider/Cargo.lock +++ b/examples/ffi-table-provider/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "abi_stable" @@ -144,9 +144,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4caf25cdc4a985f91df42ed9e9308e1adbcd341a31a72605c697033fcef163e3" +checksum = "6422e12ac345a0678d7a17e316238e3a40547ae7f92052b77bd86d5e0239f3fc" dependencies = [ "arrow-arith", "arrow-array", @@ -165,24 +165,23 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91f2dfd1a7ec0aca967dfaa616096aec49779adc8eccec005e2f5e4111b1192a" +checksum = "23cf34bb1f48c41d3475927bcc7be498665b8e80b379b88f62a840337f8b8248" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "half", "num", ] [[package]] name = "arrow-array" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d39387ca628be747394890a6e47f138ceac1aa912eab64f02519fed24b637af8" +checksum = "fb4a06d507f54b70a277be22a127c8ffe0cec6cd98c0ad8a48e77779bbda8223" dependencies = [ "ahash", "arrow-buffer", @@ -191,15 +190,15 @@ dependencies = [ "chrono", "chrono-tz", 
"half", - "hashbrown 0.14.5", + "hashbrown 0.15.1", "num", ] [[package]] name = "arrow-buffer" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e51e05228852ffe3eb391ce7178a0f97d2cf80cc6ef91d3c4a6b3cb688049ec" +checksum = "d69d326d5ad1cb82dcefa9ede3fee8fdca98f9982756b16f9cb142f4aa6edc89" dependencies = [ "bytes", "half", @@ -208,9 +207,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d09aea56ec9fa267f3f3f6cdab67d8a9974cbba90b3aa38c8fe9d0bb071bd8c1" +checksum = "626e65bd42636a84a238bed49d09c8777e3d825bf81f5087a70111c2831d9870" dependencies = [ "arrow-array", "arrow-buffer", @@ -229,28 +228,25 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c07b5232be87d115fde73e32f2ca7f1b353bff1b44ac422d3c6fc6ae38f11f0d" +checksum = "71c8f959f7a1389b1dbd883cdcd37c3ed12475329c111912f7f69dad8195d8c6" dependencies = [ "arrow-array", - "arrow-buffer", "arrow-cast", - "arrow-data", "arrow-schema", "chrono", "csv", "csv-core", "lazy_static", - "lexical-core", "regex", ] [[package]] name = "arrow-data" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b98ae0af50890b494cebd7d6b04b35e896205c1d1df7b29a6272c5d0d0249ef5" +checksum = "1858e7c7d01c44cf71c21a85534fd1a54501e8d60d1195d0d6fbcc00f4b10754" dependencies = [ "arrow-buffer", "arrow-schema", @@ -260,13 +256,12 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ed91bdeaff5a1c00d28d8f73466bcb64d32bbd7093b5a30156b4b9f4dba3eee" +checksum = "a6bb3f727f049884c7603f0364bc9315363f356b59e9f605ea76541847e06a1e" dependencies = [ "arrow-array", "arrow-buffer", 
- "arrow-cast", "arrow-data", "arrow-schema", "flatbuffers", @@ -275,9 +270,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0471f51260a5309307e5d409c9dc70aede1cd9cf1d4ff0f0a1e8e1a2dd0e0d3c" +checksum = "35de94f165ed8830aede72c35f238763794f0d49c69d30c44d49c9834267ff8c" dependencies = [ "arrow-array", "arrow-buffer", @@ -295,26 +290,23 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2883d7035e0b600fb4c30ce1e50e66e53d8656aa729f2bfa4b51d359cf3ded52" +checksum = "8aa06e5f267dc53efbacb933485c79b6fc1685d3ffbe870a16ce4e696fb429da" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", - "half", - "num", ] [[package]] name = "arrow-row" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "552907e8e587a6fde4f8843fd7a27a576a260f65dab6c065741ea79f633fc5be" +checksum = "66f1144bb456a2f9d82677bd3abcea019217e572fc8f07de5a7bac4b2c56eb2c" dependencies = [ - "ahash", "arrow-array", "arrow-buffer", "arrow-data", @@ -324,18 +316,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "539ada65246b949bd99ffa0881a9a15a4a529448af1a07a9838dd78617dafab1" +checksum = "105f01ec0090259e9a33a9263ec18ff223ab91a0ea9fbc18042f7e38005142f6" dependencies = [ "bitflags 2.6.0", ] [[package]] name = "arrow-select" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6259e566b752da6dceab91766ed8b2e67bf6270eb9ad8a6e07a33c1bede2b125" +checksum = "f690752fdbd2dee278b5f1636fefad8f2f7134c85e20fd59c4199e15a39a6807" dependencies = [ "ahash", "arrow-array", @@ -347,9 +339,9 @@ dependencies 
= [ [[package]] name = "arrow-string" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3179ccbd18ebf04277a095ba7321b93fd1f774f18816bd5f6b3ce2f594edb6c" +checksum = "d0fff9cd745a7039b66c47ecaf5954460f9fa12eed628f65170117ea93e64ee0" dependencies = [ "arrow-array", "arrow-buffer", @@ -380,10 +372,9 @@ version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cb8f1d480b0ea3783ab015936d2a55c87e219676f0c0b7dec61494043f21857" dependencies = [ - "bzip2", + "bzip2 0.4.4", "flate2", "futures-core", - "futures-io", "memchr", "pin-project-lite", "tokio", @@ -448,6 +439,19 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bigdecimal" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f31f3af01c5c65a07985c804d3366560e6fa7883d640a122819b14ec327482c" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -540,6 +544,16 @@ dependencies = [ "libc", ] +[[package]] +name = "bzip2" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bafdbf26611df8c14810e268ddceda071c297570a5fb360ceddf617fe417ef58" +dependencies = [ + "bzip2-sys", + "libc", +] + [[package]] name = "bzip2-sys" version = "0.1.11+1.0.8" @@ -751,11 +765,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbba0799cf6913b456ed07a94f0f3b6e12c62a5d88b10809e2284a0f2b915c05" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ - "ahash", "arrow", "arrow-array", "arrow-ipc", @@ -763,9 +775,8 @@ dependencies = [ 
"async-compression", "async-trait", "bytes", - "bzip2", + "bzip2 0.5.0", "chrono", - "dashmap", "datafusion-catalog", "datafusion-common", "datafusion-common-runtime", @@ -774,6 +785,7 @@ dependencies = [ "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-nested", + "datafusion-functions-table", "datafusion-functions-window", "datafusion-optimizer", "datafusion-physical-expr", @@ -784,18 +796,13 @@ dependencies = [ "flate2", "futures", "glob", - "half", - "hashbrown 0.14.5", - "indexmap", - "itertools", + "itertools 0.14.0", "log", - "num_cpus", "object_store", "parking_lot", "parquet", - "paste", - "pin-project-lite", "rand", + "regex", "sqlparser", "tempfile", "tokio", @@ -808,67 +815,74 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7493c5c2d40eec435b13d92e5703554f4efc7059451fcb8d3a79580ff0e45560" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ - "arrow-schema", + "arrow", "async-trait", + "dashmap", "datafusion-common", "datafusion-execution", "datafusion-expr", "datafusion-physical-plan", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", "parking_lot", + "sqlparser", ] [[package]] name = "datafusion-common" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24953049ebbd6f8964f91f60aa3514e121b5e81e068e33b60e77815ab369b25c" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "ahash", "arrow", "arrow-array", "arrow-buffer", + "arrow-ipc", "arrow-schema", - "chrono", + "base64", "half", "hashbrown 0.14.5", "indexmap", - "instant", "libc", - "num_cpus", + "log", "object_store", "parquet", "paste", + "recursive", "sqlparser", "tokio", + "web-time", ] [[package]] name = 
"datafusion-common-runtime" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f06df4ef76872e11c924d3c814fd2a8dd09905ed2e2195f71c857d78abd19685" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "log", "tokio", ] +[[package]] +name = "datafusion-doc" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" + [[package]] name = "datafusion-execution" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bbdcb628d690f3ce5fea7de81642b514486d58ff9779a51f180a69a4eadb361" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", - "chrono", "dashmap", "datafusion-common", "datafusion-expr", "futures", - "hashbrown 0.14.5", "log", "object_store", "parking_lot", @@ -879,63 +893,59 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8036495980e3131f706b7d33ab00b4492d73dc714e3cb74d11b50f9602a73246" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ - "ahash", "arrow", - "arrow-array", - "arrow-buffer", "chrono", "datafusion-common", + "datafusion-doc", "datafusion-expr-common", "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr-common", "indexmap", "paste", + "recursive", "serde_json", "sqlparser", - "strum", - "strum_macros", ] [[package]] name = "datafusion-expr-common" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4da0f3cb4669f9523b403d6b5a0ec85023e0ab3bf0183afd1517475b3e64fdd2" +version = "45.0.0" +source = 
"git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", "datafusion-common", - "itertools", + "itertools 0.14.0", "paste", ] [[package]] name = "datafusion-ffi" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e923c459b53a26d92a8806d1f6a37fdf48bde51507a39eaed6f42a60f2bfd160" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "abi_stable", "arrow", + "arrow-array", + "arrow-schema", "async-ffi", "async-trait", "datafusion", "datafusion-proto", - "doc-comment", "futures", "log", "prost", + "semver", + "tokio", ] [[package]] name = "datafusion-functions" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f52c4012648b34853e40a2c6bcaa8772f837831019b68aca384fb38436dba162" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", "arrow-buffer", @@ -944,11 +954,14 @@ dependencies = [ "blake3", "chrono", "datafusion-common", + "datafusion-doc", "datafusion-execution", "datafusion-expr", + "datafusion-expr-common", + "datafusion-macros", "hashbrown 0.14.5", "hex", - "itertools", + "itertools 0.14.0", "log", "md-5", "rand", @@ -960,44 +973,42 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5b8bb624597ba28ed7446df4a9bd7c7a7bde7c578b6b527da3f47371d5f6741" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "ahash", "arrow", + "arrow-buffer", "arrow-schema", "datafusion-common", + "datafusion-doc", "datafusion-execution", "datafusion-expr", "datafusion-functions-aggregate-common", + "datafusion-macros", 
"datafusion-physical-expr", "datafusion-physical-expr-common", "half", - "indexmap", "log", "paste", ] [[package]] name = "datafusion-functions-aggregate-common" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb06208fc470bc8cf1ce2d9a1159d42db591f2c7264a8c1776b53ad8f675143" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "ahash", "arrow", "datafusion-common", "datafusion-expr-common", "datafusion-physical-expr-common", - "rand", ] [[package]] name = "datafusion-functions-nested" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fca25bbb87323716d05e54114666e942172ccca23c5a507e9c7851db6e965317" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", "arrow-array", @@ -1005,26 +1016,43 @@ dependencies = [ "arrow-ord", "arrow-schema", "datafusion-common", + "datafusion-doc", "datafusion-execution", "datafusion-expr", "datafusion-functions", "datafusion-functions-aggregate", + "datafusion-macros", "datafusion-physical-expr-common", - "itertools", + "itertools 0.14.0", "log", "paste", - "rand", +] + +[[package]] +name = "datafusion-functions-table" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", + "paste", ] [[package]] name = "datafusion-functions-window" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ae23356c634e54c59f7c51acb7a5b9f6240ffb2cf997049a1a24a8a88598dbe" +version = "45.0.0" +source = 
"git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "datafusion-common", + "datafusion-doc", "datafusion-expr", "datafusion-functions-window-common", + "datafusion-macros", "datafusion-physical-expr", "datafusion-physical-expr-common", "log", @@ -1033,48 +1061,51 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4b3d6ff7794acea026de36007077a06b18b89e4f9c3fea7f2215f9f7dd9059b" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", ] +[[package]] +name = "datafusion-macros" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +dependencies = [ + "datafusion-expr", + "quote", + "syn 2.0.87", +] + [[package]] name = "datafusion-optimizer" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bec6241eb80c595fa0e1a8a6b69686b5cf3bd5fdacb8319582a0943b0bd788aa" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", - "async-trait", "chrono", "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "hashbrown 0.14.5", "indexmap", - "itertools", + "itertools 0.14.0", "log", - "paste", + "recursive", + "regex", "regex-syntax", ] [[package]] name = "datafusion-physical-expr" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3370357b8fc75ec38577700644e5d1b0bc78f38babab99c0b8bd26bafb3e4335" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "ahash", "arrow", "arrow-array", 
"arrow-buffer", - "arrow-ord", "arrow-schema", - "arrow-string", - "chrono", "datafusion-common", "datafusion-expr", "datafusion-expr-common", @@ -1083,7 +1114,7 @@ dependencies = [ "half", "hashbrown 0.14.5", "indexmap", - "itertools", + "itertools 0.14.0", "log", "paste", "petgraph", @@ -1091,39 +1122,43 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8b7734d94bf2fa6f6e570935b0ddddd8421179ce200065be97874e13d46a47b" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "ahash", "arrow", + "arrow-buffer", "datafusion-common", "datafusion-expr-common", "hashbrown 0.14.5", - "rand", + "itertools 0.14.0", ] [[package]] name = "datafusion-physical-optimizer" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eee8c479522df21d7b395640dff88c5ed05361852dce6544d7c98e9dbcebffe" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", "arrow-schema", "datafusion-common", "datafusion-execution", + "datafusion-expr", "datafusion-expr-common", "datafusion-physical-expr", + "datafusion-physical-expr-common", "datafusion-physical-plan", - "itertools", + "futures", + "itertools 0.14.0", + "log", + "recursive", + "url", ] [[package]] name = "datafusion-physical-plan" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17e1fc2e2c239d14e8556f2622b19a726bf6bc6962cc00c71fc52626274bee24" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "ahash", "arrow", @@ -1137,7 +1172,6 @@ dependencies = [ "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", - 
"datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -1145,20 +1179,17 @@ dependencies = [ "half", "hashbrown 0.14.5", "indexmap", - "itertools", + "itertools 0.14.0", "log", - "once_cell", "parking_lot", "pin-project-lite", - "rand", "tokio", ] [[package]] name = "datafusion-proto" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f730f7fc5a20134d4e5ecdf7bbf392002ac58163d58423ea28a702dc077b06e1" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", "chrono", @@ -1172,33 +1203,30 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12c225fe49e4f943e35446b263613ada7a9e9f8d647544e6b07037b9803567df" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", - "chrono", "datafusion-common", - "object_store", "prost", ] [[package]] name = "datafusion-sql" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63e3a4ed41dbee20a5d947a59ca035c225d67dc9cbe869c10f66dcdf25e7ce51" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", "arrow-array", "arrow-schema", + "bigdecimal", "datafusion-common", "datafusion-expr", "indexmap", "log", + "recursive", "regex", "sqlparser", - "strum", ] [[package]] @@ -1223,12 +1251,6 @@ dependencies = [ "syn 2.0.87", ] -[[package]] -name = "doc-comment" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" - [[package]] name = "either" version = 
"1.13.0" @@ -1272,15 +1294,15 @@ dependencies = [ [[package]] name = "fixedbitset" -version = "0.4.2" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" [[package]] name = "flatbuffers" -version = "24.3.25" +version = "24.12.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8add37afff2d4ffa83bc748a70b4b1370984f6980768554182424ef71447c35f" +checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" dependencies = [ "bitflags 1.3.2", "rustc_version", @@ -1469,12 +1491,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" - [[package]] name = "hex" version = "0.4.3" @@ -1651,9 +1667,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.6.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" dependencies = [ "equivalent", "hashbrown 0.15.1", @@ -1665,18 +1681,6 @@ version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" -[[package]] -name = "instant" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" -dependencies = [ - "cfg-if", - "js-sys", - "wasm-bindgen", - "web-sys", -] - [[package]] name = "integer-encoding" 
version = "3.0.4" @@ -1692,6 +1696,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.11" @@ -1964,16 +1977,6 @@ dependencies = [ "libm", ] -[[package]] -name = "num_cpus" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" -dependencies = [ - "hermit-abi", - "libc", -] - [[package]] name = "object" version = "0.36.5" @@ -1994,7 +1997,7 @@ dependencies = [ "chrono", "futures", "humantime", - "itertools", + "itertools 0.13.0", "parking_lot", "percent-encoding", "snafu", @@ -2044,9 +2047,9 @@ dependencies = [ [[package]] name = "parquet" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dea02606ba6f5e856561d8d507dba8bac060aefca2a6c0f1aa1d361fed91ff3e" +checksum = "8a01a0efa30bbd601ae85b375c728efdb211ade54390281628a7b16708beb235" dependencies = [ "ahash", "arrow-array", @@ -2063,13 +2066,14 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.14.5", + "hashbrown 0.15.1", "lz4_flex", "num", "num-bigint", "object_store", "paste", "seq-macro", + "simdutf8", "snap", "thrift", "tokio", @@ -2101,9 +2105,9 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "petgraph" -version = "0.6.5" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ "fixedbitset", "indexmap", @@ -2206,17 +2210,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" dependencies = [ "anyhow", - "itertools", + "itertools 0.13.0", "proc-macro2", "quote", "syn 2.0.87", ] +[[package]] +name = "psm" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "200b9ff220857e53e184257720a14553b2f4aa02577d2ed9842d45d4b9654810" +dependencies = [ + "cc", +] + [[package]] name = "pyo3" -version = "0.22.6" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884" +checksum = "57fe09249128b3173d092de9523eaa75136bf7ba85e0d69eca241c7939c933cc" dependencies = [ "cfg-if", "indoc", @@ -2232,9 +2245,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.22.6" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38" +checksum = "1cd3927b5a78757a0d71aa9dff669f903b1eb64b54142a9bd9f757f8fde65fd7" dependencies = [ "once_cell", "target-lexicon", @@ -2242,9 +2255,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.22.6" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636" +checksum = "dab6bb2102bd8f991e7749f130a70d05dd557613e39ed2deeee8e9ca0c4d548d" dependencies = [ "libc", "pyo3-build-config", @@ -2252,9 +2265,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.22.6" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453" +checksum = "91871864b353fd5ffcb3f91f2f703a22a9797c91b9ab497b1acac7b07ae509c7" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -2264,9 +2277,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.22.6" +version = "0.23.4" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe" +checksum = "43abc3b80bc20f3facd86cd3c60beed58c3e2aa26213f3cda368de39c60a27e4" dependencies = [ "heck", "proc-macro2", @@ -2314,6 +2327,26 @@ dependencies = [ "getrandom", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.87", +] + [[package]] name = "redox_syscall" version = "0.5.7" @@ -2418,9 +2451,9 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "semver" -version = "1.0.23" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" +checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03" [[package]] name = "seq-macro" @@ -2477,6 +2510,12 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "siphasher" version = "0.3.11" @@ -2527,9 +2566,9 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "sqlparser" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5fe11944a61da0da3f592e19a45ebe5ab92dc14a779907ff1f08fbb797bfefc7" +checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8" dependencies = [ "log", "sqlparser_derive", @@ -2537,9 +2576,9 @@ dependencies = [ [[package]] name = "sqlparser_derive" -version = "0.2.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" +checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", @@ -2552,6 +2591,19 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "stacker" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "799c883d55abdb5e98af1a7b3f23b9b6de8ecada0ecac058672d7635eb48ca7b" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + [[package]] name = "static_assertions" version = "1.1.0" @@ -2563,9 +2615,6 @@ name = "strum" version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" -dependencies = [ - "strum_macros", -] [[package]] name = "strum_macros" @@ -2798,9 +2847,9 @@ checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" [[package]] name = "url" -version = "2.5.3" +version = "2.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d157f1b96d14500ffdc1f10ba712e780825526c03d9a49b4d0324b0d9113ada" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" dependencies = [ "form_urlencoded", "idna", @@ -2906,10 +2955,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" [[package]] -name = "web-sys" -version 
= "0.3.72" +name = "web-time" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/examples/ffi-table-provider/Cargo.toml b/examples/ffi-table-provider/Cargo.toml index 4e6f91f33..0e558fdd0 100644 --- a/examples/ffi-table-provider/Cargo.toml +++ b/examples/ffi-table-provider/Cargo.toml @@ -21,15 +21,15 @@ version = "0.1.0" edition = "2021" [dependencies] -datafusion = { version = "44.0.0" } -datafusion-ffi = { version = "44.0.0" } -pyo3 = { version = "0.22.6", features = ["extension-module", "abi3", "abi3-py38"] } -arrow = { version = "53.2.0" } -arrow-array = { version = "53.2.0" } -arrow-schema = { version = "53.2.0" } +datafusion = { version = "45.0.0" } +datafusion-ffi = { version = "45.0.0" } +pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py38"] } +arrow = { version = "54" } +arrow-array = { version = "54" } +arrow-schema = { version = "54" } [build-dependencies] -pyo3-build-config = "0.22.6" +pyo3-build-config = "0.23" [lib] name = "ffi_table_provider" diff --git a/examples/ffi-table-provider/src/lib.rs b/examples/ffi-table-provider/src/lib.rs index 473244d88..88deeece2 100644 --- a/examples/ffi-table-provider/src/lib.rs +++ b/examples/ffi-table-provider/src/lib.rs @@ -102,7 +102,7 @@ impl MyTableProvider { let provider = self .create_table() .map_err(|e| PyRuntimeError::new_err(e.to_string()))?; - let provider = FFI_TableProvider::new(Arc::new(provider), false); + let provider = FFI_TableProvider::new(Arc::new(provider), false, None); PyCapsule::new_bound(py, provider, Some(name.clone())) } diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py index 77f88aa44..354c7e180 100644 --- a/python/tests/test_expr.py +++ b/python/tests/test_expr.py @@ -148,8 +148,7 @@ def 
test_relational_expr(test_ctx): assert df.filter(col("b") == "beta").count() == 1 assert df.filter(col("b") != "beta").count() == 2 - with pytest.raises(Exception): - df.filter(col("a") == "beta").count() + assert df.filter(col("a") == "beta").count() == 0 def test_expr_to_variant(): diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index ad6aa7c0a..796b1f76e 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -732,7 +732,7 @@ def test_array_function_obj_tests(stmt, py_expr): ), ( f.regexp_match(column("a"), literal("(ell|orl)")), - pa.array([["ell"], ["orl"], None]), + pa.array([["ell"], ["orl"], None], type=pa.list_(pa.string_view())), ), ( f.regexp_replace(column("a"), literal("(ell|orl)"), literal("-")), diff --git a/src/context.rs b/src/context.rs index f53b15576..ebe7db230 100644 --- a/src/context.rs +++ b/src/context.rs @@ -48,7 +48,7 @@ use crate::utils::{get_tokio_runtime, validate_pycapsule, wait_for_future}; use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::arrow::pyarrow::PyArrowType; use datafusion::arrow::record_batch::RecordBatch; -use datafusion::catalog_common::TableReference; +use datafusion::common::TableReference; use datafusion::common::{exec_err, ScalarValue}; use datafusion::datasource::file_format::file_compression_type::FileCompressionType; use datafusion::datasource::file_format::parquet::ParquetFormat; diff --git a/src/dataframe.rs b/src/dataframe.rs index 6fb08ba25..13d7ae838 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -546,7 +546,7 @@ impl PyDataFrame { /// Collect the batches and pass to Arrow Table fn to_arrow_table(&self, py: Python<'_>) -> PyResult { let batches = self.collect(py)?.to_object(py); - let schema: PyObject = self.schema().into_py(py); + let schema: PyObject = self.schema().into_pyobject(py)?.to_object(py); // Instantiate pyarrow Table object and use its from_batches method let table_class = 
py.import_bound("pyarrow")?.getattr("Table")?; From 40a61c150adee6beb9961302fece81c33639082e Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sun, 16 Feb 2025 02:31:00 +0800 Subject: [PATCH 097/248] add to_timestamp_nanos (#1020) --- python/datafusion/functions.py | 1 + python/tests/test_functions.py | 4 ++++ src/functions.rs | 2 ++ 3 files changed, 7 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 7c2fa9a8f..5c260aade 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -252,6 +252,7 @@ "to_hex", "to_timestamp", "to_timestamp_micros", + "to_timestamp_nanos", "to_timestamp_millis", "to_timestamp_seconds", "to_unixtime", diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 796b1f76e..b1a739b49 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -871,6 +871,7 @@ def test_temporal_functions(df): f.to_timestamp_millis(literal("2023-09-07 05:06:14.523952")), f.to_timestamp_micros(literal("2023-09-07 05:06:14.523952")), f.extract(literal("day"), column("d")), + f.to_timestamp_nanos(literal("2023-09-07 05:06:14.523952")), ) result = df.collect() assert len(result) == 1 @@ -909,6 +910,9 @@ def test_temporal_functions(df): [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us") ) assert result.column(10) == pa.array([31, 26, 2], type=pa.int32()) + assert result.column(11) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + ) def test_arrow_cast(df): diff --git a/src/functions.rs b/src/functions.rs index 46c748cf8..6a8abb18d 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -553,6 +553,7 @@ expr_fn!( expr_fn!(now); expr_fn_vec!(to_timestamp); expr_fn_vec!(to_timestamp_millis); +expr_fn_vec!(to_timestamp_nanos); expr_fn_vec!(to_timestamp_micros); expr_fn_vec!(to_timestamp_seconds); expr_fn_vec!(to_unixtime); @@ -977,6 +978,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) 
-> PyResult<()> { m.add_wrapped(wrap_pyfunction!(to_hex))?; m.add_wrapped(wrap_pyfunction!(to_timestamp))?; m.add_wrapped(wrap_pyfunction!(to_timestamp_millis))?; + m.add_wrapped(wrap_pyfunction!(to_timestamp_nanos))?; m.add_wrapped(wrap_pyfunction!(to_timestamp_micros))?; m.add_wrapped(wrap_pyfunction!(to_timestamp_seconds))?; m.add_wrapped(wrap_pyfunction!(to_unixtime))?; From 3584bec8900bcfb33bcae4b85a3c47a46b82c72e Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 19 Feb 2025 20:50:31 -0500 Subject: [PATCH 098/248] [infra] Fail Clippy on rust build warnings (#1029) * pyo3 update required changes to deprecated interfaces * Substrait feature clippy updates * PyTuple was called twice * add -D warnings option --------- Co-authored-by: Tim Saucer --- .github/workflows/test.yaml | 2 +- .pre-commit-config.yaml | 2 +- src/config.rs | 10 +++--- src/context.rs | 12 +++---- src/dataframe.rs | 17 +++++---- src/dataset.rs | 2 +- src/dataset_exec.rs | 8 ++--- src/errors.rs | 4 +++ src/expr.rs | 61 ++++++++++++++++---------------- src/expr/aggregate.rs | 6 ++-- src/expr/analyze.rs | 6 ++-- src/expr/create_memory_table.rs | 6 ++-- src/expr/create_view.rs | 6 ++-- src/expr/distinct.rs | 6 ++-- src/expr/drop_table.rs | 6 ++-- src/expr/empty_relation.rs | 6 ++-- src/expr/explain.rs | 6 ++-- src/expr/extension.rs | 6 ++-- src/expr/filter.rs | 6 ++-- src/expr/join.rs | 6 ++-- src/expr/limit.rs | 6 ++-- src/expr/literal.rs | 6 ++-- src/expr/logical_node.rs | 4 +-- src/expr/projection.rs | 6 ++-- src/expr/repartition.rs | 6 ++-- src/expr/sort.rs | 6 ++-- src/expr/subquery.rs | 6 ++-- src/expr/subquery_alias.rs | 6 ++-- src/expr/table_scan.rs | 6 ++-- src/expr/union.rs | 6 ++-- src/expr/unnest.rs | 6 ++-- src/expr/window.rs | 6 ++-- src/lib.rs | 10 +++--- src/physical_plan.rs | 2 +- src/pyarrow_filter_expression.rs | 36 ++++++++++--------- src/pyarrow_util.rs | 4 +-- src/sql/logical.rs | 4 +-- src/substrait.rs | 4 +-- src/udaf.rs | 5 +-- src/udf.rs | 5 +-- src/udwf.rs | 44 
+++++++++++------------ 41 files changed, 188 insertions(+), 180 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index c93d4c06f..c1d9ac838 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -71,7 +71,7 @@ jobs: - name: Run Clippy if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }} - run: cargo clippy --all-targets --all-features -- -D clippy::all -A clippy::redundant_closure + run: cargo clippy --all-targets --all-features -- -D clippy::all -D warnings -A clippy::redundant_closure - name: Install dependencies and build uses: astral-sh/setup-uv@v5 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e20fedf5c..b548ff18f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,7 +40,7 @@ repos: - id: rust-clippy name: Rust clippy description: Run cargo clippy on files included in the commit. clippy should be installed before-hand. - entry: cargo clippy --all-targets --all-features -- -Dclippy::all -Aclippy::redundant_closure + entry: cargo clippy --all-targets --all-features -- -Dclippy::all -D warnings -Aclippy::redundant_closure pass_filenames: false types: [file, rust] language: system diff --git a/src/config.rs b/src/config.rs index cc725b9a3..667d5c590 100644 --- a/src/config.rs +++ b/src/config.rs @@ -47,14 +47,14 @@ impl PyConfig { } /// Get a configuration option - pub fn get(&mut self, key: &str, py: Python) -> PyResult { + pub fn get<'py>(&mut self, key: &str, py: Python<'py>) -> PyResult> { let options = self.config.to_owned(); for entry in options.entries() { if entry.key == key { - return Ok(entry.value.into_py(py)); + return Ok(entry.value.into_pyobject(py)?); } } - Ok(None::.into_py(py)) + Ok(None::.into_pyobject(py)?) 
} /// Set a configuration option @@ -66,10 +66,10 @@ impl PyConfig { /// Get all configuration options pub fn get_all(&mut self, py: Python) -> PyResult { - let dict = PyDict::new_bound(py); + let dict = PyDict::new(py); let options = self.config.to_owned(); for entry in options.entries() { - dict.set_item(entry.key, entry.value.clone().into_py(py))?; + dict.set_item(entry.key, entry.value.clone().into_pyobject(py)?)?; } Ok(dict.into()) } diff --git a/src/context.rs b/src/context.rs index ebe7db230..0f962638e 100644 --- a/src/context.rs +++ b/src/context.rs @@ -458,8 +458,8 @@ impl PySessionContext { let py = data.py(); // Instantiate pyarrow Table object & convert to Arrow Table - let table_class = py.import_bound("pyarrow")?.getattr("Table")?; - let args = PyTuple::new_bound(py, &[data]); + let table_class = py.import("pyarrow")?.getattr("Table")?; + let args = PyTuple::new(py, &[data])?; let table = table_class.call_method1("from_pylist", args)?; // Convert Arrow Table to datafusion DataFrame @@ -478,8 +478,8 @@ impl PySessionContext { let py = data.py(); // Instantiate pyarrow Table object & convert to Arrow Table - let table_class = py.import_bound("pyarrow")?.getattr("Table")?; - let args = PyTuple::new_bound(py, &[data]); + let table_class = py.import("pyarrow")?.getattr("Table")?; + let args = PyTuple::new(py, &[data])?; let table = table_class.call_method1("from_pydict", args)?; // Convert Arrow Table to datafusion DataFrame @@ -533,8 +533,8 @@ impl PySessionContext { let py = data.py(); // Instantiate pyarrow Table object & convert to Arrow Table - let table_class = py.import_bound("pyarrow")?.getattr("Table")?; - let args = PyTuple::new_bound(py, &[data]); + let table_class = py.import("pyarrow")?.getattr("Table")?; + let args = PyTuple::new(py, &[data])?; let table = table_class.call_method1("from_pandas", args)?; // Convert Arrow Table to datafusion DataFrame diff --git a/src/dataframe.rs b/src/dataframe.rs index 13d7ae838..ed9578a71 100644 --- 
a/src/dataframe.rs +++ b/src/dataframe.rs @@ -545,12 +545,12 @@ impl PyDataFrame { /// Convert to Arrow Table /// Collect the batches and pass to Arrow Table fn to_arrow_table(&self, py: Python<'_>) -> PyResult { - let batches = self.collect(py)?.to_object(py); - let schema: PyObject = self.schema().into_pyobject(py)?.to_object(py); + let batches = self.collect(py)?.into_pyobject(py)?; + let schema = self.schema().into_pyobject(py)?; // Instantiate pyarrow Table object and use its from_batches method - let table_class = py.import_bound("pyarrow")?.getattr("Table")?; - let args = PyTuple::new_bound(py, &[batches, schema]); + let table_class = py.import("pyarrow")?.getattr("Table")?; + let args = PyTuple::new(py, &[batches, schema])?; let table: PyObject = table_class.call_method1("from_batches", args)?.into(); Ok(table) } @@ -585,8 +585,7 @@ impl PyDataFrame { let ffi_stream = FFI_ArrowArrayStream::new(reader); let stream_capsule_name = CString::new("arrow_array_stream").unwrap(); - PyCapsule::new_bound(py, ffi_stream, Some(stream_capsule_name)) - .map_err(PyDataFusionError::from) + PyCapsule::new(py, ffi_stream, Some(stream_capsule_name)).map_err(PyDataFusionError::from) } fn execute_stream(&self, py: Python) -> PyDataFusionResult { @@ -649,8 +648,8 @@ impl PyDataFrame { /// Collect the batches, pass to Arrow Table & then convert to polars DataFrame fn to_polars(&self, py: Python<'_>) -> PyResult { let table = self.to_arrow_table(py)?; - let dataframe = py.import_bound("polars")?.getattr("DataFrame")?; - let args = PyTuple::new_bound(py, &[table]); + let dataframe = py.import("polars")?.getattr("DataFrame")?; + let args = PyTuple::new(py, &[table])?; let result: PyObject = dataframe.call1(args)?.into(); Ok(result) } @@ -673,7 +672,7 @@ fn print_dataframe(py: Python, df: DataFrame) -> PyDataFusionResult<()> { // Import the Python 'builtins' module to access the print function // Note that println! 
does not print to the Python debug console and is not visible in notebooks for instance - let print = py.import_bound("builtins")?.getattr("print")?; + let print = py.import("builtins")?.getattr("print")?; print.call1((result,))?; Ok(()) } diff --git a/src/dataset.rs b/src/dataset.rs index a8fa21ec5..0baf4da2a 100644 --- a/src/dataset.rs +++ b/src/dataset.rs @@ -48,7 +48,7 @@ impl Dataset { // Creates a Python PyArrow.Dataset pub fn new(dataset: &Bound<'_, PyAny>, py: Python) -> PyResult { // Ensure that we were passed an instance of pyarrow.dataset.Dataset - let ds = PyModule::import_bound(py, "pyarrow.dataset")?; + let ds = PyModule::import(py, "pyarrow.dataset")?; let ds_attr = ds.getattr("Dataset")?; let ds_type = ds_attr.downcast::()?; if dataset.is_instance(ds_type)? { diff --git a/src/dataset_exec.rs b/src/dataset_exec.rs index ace42115b..445e4fe74 100644 --- a/src/dataset_exec.rs +++ b/src/dataset_exec.rs @@ -104,7 +104,7 @@ impl DatasetExec { }) .transpose()?; - let kwargs = PyDict::new_bound(py); + let kwargs = PyDict::new(py); kwargs.set_item("columns", columns.clone())?; kwargs.set_item( @@ -121,7 +121,7 @@ impl DatasetExec { .0, ); - let builtins = Python::import_bound(py, "builtins")?; + let builtins = Python::import(py, "builtins")?; let pylist = builtins.getattr("list")?; // Get the fragments or partitions of the dataset @@ -198,7 +198,7 @@ impl ExecutionPlan for DatasetExec { let dataset_schema = dataset .getattr("schema") .map_err(|err| InnerDataFusionError::External(Box::new(err)))?; - let kwargs = PyDict::new_bound(py); + let kwargs = PyDict::new(py); kwargs .set_item("columns", self.columns.clone()) .map_err(|err| InnerDataFusionError::External(Box::new(err)))?; @@ -223,7 +223,7 @@ impl ExecutionPlan for DatasetExec { let record_batches: Bound<'_, PyIterator> = scanner .call_method0("to_batches") .map_err(|err| InnerDataFusionError::External(Box::new(err)))? 
- .iter() + .try_iter() .map_err(|err| InnerDataFusionError::External(Box::new(err)))?; let record_batches = PyArrowBatchesAdapter { diff --git a/src/errors.rs b/src/errors.rs index b02b754a2..f1d5aeb23 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -91,3 +91,7 @@ pub fn py_datafusion_err(e: impl Debug) -> PyErr { pub fn py_unsupported_variant_err(e: impl Debug) -> PyErr { PyErr::new::(format!("{e:?}")) } + +pub fn to_datafusion_err(e: impl Debug) -> InnerDataFusionError { + InnerDataFusionError::Execution(format!("{e:?}")) +} diff --git a/src/expr.rs b/src/expr.rs index 1e9983d42..e750be6a4 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -19,6 +19,7 @@ use datafusion::logical_expr::utils::exprlist_to_fields; use datafusion::logical_expr::{ ExprFuncBuilder, ExprFunctionExt, LogicalPlan, WindowFunctionDefinition, }; +use pyo3::IntoPyObjectExt; use pyo3::{basic::CompareOp, prelude::*}; use std::convert::{From, Into}; use std::sync::Arc; @@ -126,35 +127,35 @@ pub fn py_expr_list(expr: &[Expr]) -> PyResult> { #[pymethods] impl PyExpr { /// Return the specific expression - fn to_variant(&self, py: Python) -> PyResult { + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { Python::with_gil(|_| { match &self.expr { - Expr::Alias(alias) => Ok(PyAlias::from(alias.clone()).into_py(py)), - Expr::Column(col) => Ok(PyColumn::from(col.clone()).into_py(py)), + Expr::Alias(alias) => Ok(PyAlias::from(alias.clone()).into_bound_py_any(py)?), + Expr::Column(col) => Ok(PyColumn::from(col.clone()).into_bound_py_any(py)?), Expr::ScalarVariable(data_type, variables) => { - Ok(PyScalarVariable::new(data_type, variables).into_py(py)) + Ok(PyScalarVariable::new(data_type, variables).into_bound_py_any(py)?) 
} - Expr::Like(value) => Ok(PyLike::from(value.clone()).into_py(py)), - Expr::Literal(value) => Ok(PyLiteral::from(value.clone()).into_py(py)), - Expr::BinaryExpr(expr) => Ok(PyBinaryExpr::from(expr.clone()).into_py(py)), - Expr::Not(expr) => Ok(PyNot::new(*expr.clone()).into_py(py)), - Expr::IsNotNull(expr) => Ok(PyIsNotNull::new(*expr.clone()).into_py(py)), - Expr::IsNull(expr) => Ok(PyIsNull::new(*expr.clone()).into_py(py)), - Expr::IsTrue(expr) => Ok(PyIsTrue::new(*expr.clone()).into_py(py)), - Expr::IsFalse(expr) => Ok(PyIsFalse::new(*expr.clone()).into_py(py)), - Expr::IsUnknown(expr) => Ok(PyIsUnknown::new(*expr.clone()).into_py(py)), - Expr::IsNotTrue(expr) => Ok(PyIsNotTrue::new(*expr.clone()).into_py(py)), - Expr::IsNotFalse(expr) => Ok(PyIsNotFalse::new(*expr.clone()).into_py(py)), - Expr::IsNotUnknown(expr) => Ok(PyIsNotUnknown::new(*expr.clone()).into_py(py)), - Expr::Negative(expr) => Ok(PyNegative::new(*expr.clone()).into_py(py)), + Expr::Like(value) => Ok(PyLike::from(value.clone()).into_bound_py_any(py)?), + Expr::Literal(value) => Ok(PyLiteral::from(value.clone()).into_bound_py_any(py)?), + Expr::BinaryExpr(expr) => Ok(PyBinaryExpr::from(expr.clone()).into_bound_py_any(py)?), + Expr::Not(expr) => Ok(PyNot::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsNotNull(expr) => Ok(PyIsNotNull::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsNull(expr) => Ok(PyIsNull::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsTrue(expr) => Ok(PyIsTrue::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsFalse(expr) => Ok(PyIsFalse::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsUnknown(expr) => Ok(PyIsUnknown::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsNotTrue(expr) => Ok(PyIsNotTrue::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsNotFalse(expr) => Ok(PyIsNotFalse::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsNotUnknown(expr) => Ok(PyIsNotUnknown::new(*expr.clone()).into_bound_py_any(py)?), + Expr::Negative(expr) 
=> Ok(PyNegative::new(*expr.clone()).into_bound_py_any(py)?), Expr::AggregateFunction(expr) => { - Ok(PyAggregateFunction::from(expr.clone()).into_py(py)) + Ok(PyAggregateFunction::from(expr.clone()).into_bound_py_any(py)?) } - Expr::SimilarTo(value) => Ok(PySimilarTo::from(value.clone()).into_py(py)), - Expr::Between(value) => Ok(between::PyBetween::from(value.clone()).into_py(py)), - Expr::Case(value) => Ok(case::PyCase::from(value.clone()).into_py(py)), - Expr::Cast(value) => Ok(cast::PyCast::from(value.clone()).into_py(py)), - Expr::TryCast(value) => Ok(cast::PyTryCast::from(value.clone()).into_py(py)), + Expr::SimilarTo(value) => Ok(PySimilarTo::from(value.clone()).into_bound_py_any(py)?), + Expr::Between(value) => Ok(between::PyBetween::from(value.clone()).into_bound_py_any(py)?), + Expr::Case(value) => Ok(case::PyCase::from(value.clone()).into_bound_py_any(py)?), + Expr::Cast(value) => Ok(cast::PyCast::from(value.clone()).into_bound_py_any(py)?), + Expr::TryCast(value) => Ok(cast::PyTryCast::from(value.clone()).into_bound_py_any(py)?), Expr::ScalarFunction(value) => Err(py_unsupported_variant_err(format!( "Converting Expr::ScalarFunction to a Python object is not implemented: {:?}", value @@ -163,29 +164,29 @@ impl PyExpr { "Converting Expr::WindowFunction to a Python object is not implemented: {:?}", value ))), - Expr::InList(value) => Ok(in_list::PyInList::from(value.clone()).into_py(py)), - Expr::Exists(value) => Ok(exists::PyExists::from(value.clone()).into_py(py)), + Expr::InList(value) => Ok(in_list::PyInList::from(value.clone()).into_bound_py_any(py)?), + Expr::Exists(value) => Ok(exists::PyExists::from(value.clone()).into_bound_py_any(py)?), Expr::InSubquery(value) => { - Ok(in_subquery::PyInSubquery::from(value.clone()).into_py(py)) + Ok(in_subquery::PyInSubquery::from(value.clone()).into_bound_py_any(py)?) 
} Expr::ScalarSubquery(value) => { - Ok(scalar_subquery::PyScalarSubquery::from(value.clone()).into_py(py)) + Ok(scalar_subquery::PyScalarSubquery::from(value.clone()).into_bound_py_any(py)?) } Expr::Wildcard { qualifier, options } => Err(py_unsupported_variant_err(format!( "Converting Expr::Wildcard to a Python object is not implemented : {:?} {:?}", qualifier, options ))), Expr::GroupingSet(value) => { - Ok(grouping_set::PyGroupingSet::from(value.clone()).into_py(py)) + Ok(grouping_set::PyGroupingSet::from(value.clone()).into_bound_py_any(py)?) } Expr::Placeholder(value) => { - Ok(placeholder::PyPlaceholder::from(value.clone()).into_py(py)) + Ok(placeholder::PyPlaceholder::from(value.clone()).into_bound_py_any(py)?) } Expr::OuterReferenceColumn(data_type, column) => Err(py_unsupported_variant_err(format!( "Converting Expr::OuterReferenceColumn to a Python object is not implemented: {:?} - {:?}", data_type, column ))), - Expr::Unnest(value) => Ok(unnest_expr::PyUnnestExpr::from(value.clone()).into_py(py)), + Expr::Unnest(value) => Ok(unnest_expr::PyUnnestExpr::from(value.clone()).into_bound_py_any(py)?), } }) } diff --git a/src/expr/aggregate.rs b/src/expr/aggregate.rs index 389bfb332..8fc9da5b0 100644 --- a/src/expr/aggregate.rs +++ b/src/expr/aggregate.rs @@ -19,7 +19,7 @@ use datafusion::common::DataFusionError; use datafusion::logical_expr::expr::{AggregateFunction, Alias}; use datafusion::logical_expr::logical_plan::Aggregate; use datafusion::logical_expr::Expr; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use super::logical_node::LogicalNode; @@ -151,7 +151,7 @@ impl LogicalNode for PyAggregate { vec![PyLogicalPlan::from((*self.aggregate.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/analyze.rs b/src/expr/analyze.rs index 
084513971..62f93cd26 100644 --- a/src/expr/analyze.rs +++ b/src/expr/analyze.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::logical_expr::logical_plan::Analyze; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use super::logical_node::LogicalNode; @@ -78,7 +78,7 @@ impl LogicalNode for PyAnalyze { vec![PyLogicalPlan::from((*self.analyze.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/create_memory_table.rs b/src/expr/create_memory_table.rs index 01ebb66b0..8872b2d47 100644 --- a/src/expr/create_memory_table.rs +++ b/src/expr/create_memory_table.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::CreateMemoryTable; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::sql::logical::PyLogicalPlan; @@ -91,7 +91,7 @@ impl LogicalNode for PyCreateMemoryTable { vec![PyLogicalPlan::from((*self.create.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/create_view.rs b/src/expr/create_view.rs index d119f5c21..87bb76876 100644 --- a/src/expr/create_view.rs +++ b/src/expr/create_view.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::{CreateView, DdlStatement, LogicalPlan}; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::{errors::py_type_err, sql::logical::PyLogicalPlan}; @@ -88,8 +88,8 @@ impl LogicalNode for PyCreateView { vec![PyLogicalPlan::from((*self.create.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + 
self.clone().into_bound_py_any(py) } } diff --git a/src/expr/distinct.rs b/src/expr/distinct.rs index 061ab4824..b62b776f8 100644 --- a/src/expr/distinct.rs +++ b/src/expr/distinct.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::Distinct; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::sql::logical::PyLogicalPlan; @@ -89,7 +89,7 @@ impl LogicalNode for PyDistinct { } } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/drop_table.rs b/src/expr/drop_table.rs index 330156abe..96983c1cf 100644 --- a/src/expr/drop_table.rs +++ b/src/expr/drop_table.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::logical_plan::DropTable; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::sql::logical::PyLogicalPlan; @@ -83,7 +83,7 @@ impl LogicalNode for PyDropTable { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/empty_relation.rs b/src/expr/empty_relation.rs index ce7163466..a1534ac15 100644 --- a/src/expr/empty_relation.rs +++ b/src/expr/empty_relation.rs @@ -17,7 +17,7 @@ use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; use datafusion::logical_expr::EmptyRelation; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use super::logical_node::LogicalNode; @@ -79,7 +79,7 @@ impl LogicalNode for PyEmptyRelation { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/explain.rs b/src/expr/explain.rs index 
8e7fb8843..fc02fe2b5 100644 --- a/src/expr/explain.rs +++ b/src/expr/explain.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::{logical_plan::Explain, LogicalPlan}; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::{common::df_schema::PyDFSchema, errors::py_type_err, sql::logical::PyLogicalPlan}; @@ -104,7 +104,7 @@ impl LogicalNode for PyExplain { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/extension.rs b/src/expr/extension.rs index a29802b0b..1e3fbb199 100644 --- a/src/expr/extension.rs +++ b/src/expr/extension.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::logical_expr::Extension; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::sql::logical::PyLogicalPlan; @@ -46,7 +46,7 @@ impl LogicalNode for PyExtension { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/filter.rs b/src/expr/filter.rs index a6d8aa7ee..9bdb667cd 100644 --- a/src/expr/filter.rs +++ b/src/expr/filter.rs @@ -16,7 +16,7 @@ // under the License. 
use datafusion::logical_expr::logical_plan::Filter; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -81,7 +81,7 @@ impl LogicalNode for PyFilter { vec![PyLogicalPlan::from((*self.filter.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/join.rs b/src/expr/join.rs index 66e677f8a..76ec532e7 100644 --- a/src/expr/join.rs +++ b/src/expr/join.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::logical_expr::logical_plan::{Join, JoinConstraint, JoinType}; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -193,7 +193,7 @@ impl LogicalNode for PyJoin { ] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/limit.rs b/src/expr/limit.rs index 84ad7d68b..c2a33ff89 100644 --- a/src/expr/limit.rs +++ b/src/expr/limit.rs @@ -16,7 +16,7 @@ // under the License. 
use datafusion::logical_expr::logical_plan::Limit; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -90,7 +90,7 @@ impl LogicalNode for PyLimit { vec![PyLogicalPlan::from((*self.limit.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/literal.rs b/src/expr/literal.rs index 2cb2079f1..a660ac914 100644 --- a/src/expr/literal.rs +++ b/src/expr/literal.rs @@ -17,7 +17,7 @@ use crate::errors::PyDataFusionError; use datafusion::common::ScalarValue; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; #[pyclass(name = "Literal", module = "datafusion.expr", subclass)] #[derive(Clone)] @@ -144,8 +144,8 @@ impl PyLiteral { } #[allow(clippy::wrong_self_convention)] - fn into_type(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn into_type<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } fn __repr__(&self) -> PyResult { diff --git a/src/expr/logical_node.rs b/src/expr/logical_node.rs index 757e4f94b..5aff70059 100644 --- a/src/expr/logical_node.rs +++ b/src/expr/logical_node.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use pyo3::{PyObject, PyResult, Python}; +use pyo3::{Bound, PyAny, PyResult, Python}; use crate::sql::logical::PyLogicalPlan; @@ -25,5 +25,5 @@ pub trait LogicalNode { /// The input plan to the current logical node instance. 
fn inputs(&self) -> Vec; - fn to_variant(&self, py: Python) -> PyResult; + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult>; } diff --git a/src/expr/projection.rs b/src/expr/projection.rs index 36534fdb2..dc7e5e3c1 100644 --- a/src/expr/projection.rs +++ b/src/expr/projection.rs @@ -17,7 +17,7 @@ use datafusion::logical_expr::logical_plan::Projection; use datafusion::logical_expr::Expr; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -113,7 +113,7 @@ impl LogicalNode for PyProjection { vec![PyLogicalPlan::from((*self.projection.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/repartition.rs b/src/expr/repartition.rs index 4e680e181..3e782d6af 100644 --- a/src/expr/repartition.rs +++ b/src/expr/repartition.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::{logical_plan::Repartition, Expr, Partitioning}; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::{errors::py_type_err, sql::logical::PyLogicalPlan}; @@ -121,7 +121,7 @@ impl LogicalNode for PyRepartition { vec![PyLogicalPlan::from((*self.repartition.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/sort.rs b/src/expr/sort.rs index a1803ccaf..ed4947591 100644 --- a/src/expr/sort.rs +++ b/src/expr/sort.rs @@ -17,7 +17,7 @@ use datafusion::common::DataFusionError; use datafusion::logical_expr::logical_plan::Sort; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -96,7 +96,7 @@ impl LogicalNode for 
PySort { vec![PyLogicalPlan::from((*self.sort.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/subquery.rs b/src/expr/subquery.rs index dac8d0a2b..5ebfe6927 100644 --- a/src/expr/subquery.rs +++ b/src/expr/subquery.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::Subquery; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::sql::logical::PyLogicalPlan; @@ -75,7 +75,7 @@ impl LogicalNode for PySubquery { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/subquery_alias.rs b/src/expr/subquery_alias.rs index a83cff96d..267a4d485 100644 --- a/src/expr/subquery_alias.rs +++ b/src/expr/subquery_alias.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::SubqueryAlias; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; @@ -85,7 +85,7 @@ impl LogicalNode for PySubqueryAlias { vec![PyLogicalPlan::from((*self.subquery_alias.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/table_scan.rs b/src/expr/table_scan.rs index f61be7fe4..6a0d53f0f 100644 --- a/src/expr/table_scan.rs +++ b/src/expr/table_scan.rs @@ -17,7 +17,7 @@ use datafusion::common::TableReference; use datafusion::logical_expr::logical_plan::TableScan; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::expr::logical_node::LogicalNode; @@ -146,7 +146,7 @@ impl 
LogicalNode for PyTableScan { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/union.rs b/src/expr/union.rs index 62488d9a1..5a08ccc13 100644 --- a/src/expr/union.rs +++ b/src/expr/union.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::logical_expr::logical_plan::Union; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -83,7 +83,7 @@ impl LogicalNode for PyUnion { .collect() } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/unnest.rs b/src/expr/unnest.rs index adc705035..8e70e0990 100644 --- a/src/expr/unnest.rs +++ b/src/expr/unnest.rs @@ -16,7 +16,7 @@ // under the License. 
use datafusion::logical_expr::logical_plan::Unnest; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -79,7 +79,7 @@ impl LogicalNode for PyUnnest { vec![PyLogicalPlan::from((*self.unnest_.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/window.rs b/src/expr/window.rs index 4dc6cb9c9..13deaec25 100644 --- a/src/expr/window.rs +++ b/src/expr/window.rs @@ -18,7 +18,7 @@ use datafusion::common::{DataFusionError, ScalarValue}; use datafusion::logical_expr::expr::WindowFunction; use datafusion::logical_expr::{Expr, Window, WindowFrame, WindowFrameBound, WindowFrameUnits}; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::data_type::PyScalarValue; @@ -289,7 +289,7 @@ impl LogicalNode for PyWindowExpr { vec![self.window.input.as_ref().clone().into()] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/lib.rs b/src/lib.rs index 317c3a49a..ce93ff0c3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -94,21 +94,21 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; // Register `common` as a submodule. Matching `datafusion-common` https://docs.rs/datafusion-common/latest/datafusion_common/ - let common = PyModule::new_bound(py, "common")?; + let common = PyModule::new(py, "common")?; common::init_module(&common)?; m.add_submodule(&common)?; // Register `expr` as a submodule. 
Matching `datafusion-expr` https://docs.rs/datafusion-expr/latest/datafusion_expr/ - let expr = PyModule::new_bound(py, "expr")?; + let expr = PyModule::new(py, "expr")?; expr::init_module(&expr)?; m.add_submodule(&expr)?; // Register the functions as a submodule - let funcs = PyModule::new_bound(py, "functions")?; + let funcs = PyModule::new(py, "functions")?; functions::init_module(&funcs)?; m.add_submodule(&funcs)?; - let store = PyModule::new_bound(py, "object_store")?; + let store = PyModule::new(py, "object_store")?; store::init_module(&store)?; m.add_submodule(&store)?; @@ -121,7 +121,7 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { #[cfg(feature = "substrait")] fn setup_substrait_module(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { - let substrait = PyModule::new_bound(py, "substrait")?; + let substrait = PyModule::new(py, "substrait")?; substrait::init_module(&substrait)?; m.add_submodule(&substrait)?; Ok(()) diff --git a/src/physical_plan.rs b/src/physical_plan.rs index 295908dc7..f0be45c6a 100644 --- a/src/physical_plan.rs +++ b/src/physical_plan.rs @@ -66,7 +66,7 @@ impl PyExecutionPlan { )?; let bytes = proto.encode_to_vec(); - Ok(PyBytes::new_bound(py, &bytes)) + Ok(PyBytes::new(py, &bytes)) } #[staticmethod] diff --git a/src/pyarrow_filter_expression.rs b/src/pyarrow_filter_expression.rs index 314eebf4f..4b4c86597 100644 --- a/src/pyarrow_filter_expression.rs +++ b/src/pyarrow_filter_expression.rs @@ -16,7 +16,7 @@ // under the License. 
/// Converts a Datafusion logical plan expression (Expr) into a PyArrow compute expression -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::convert::TryFrom; use std::result::Result; @@ -53,24 +53,28 @@ fn operator_to_py<'py>( Ok(py_op) } -fn extract_scalar_list(exprs: &[Expr], py: Python) -> PyDataFusionResult> { +fn extract_scalar_list<'py>( + exprs: &[Expr], + py: Python<'py>, +) -> PyDataFusionResult>> { let ret = exprs .iter() .map(|expr| match expr { // TODO: should we also leverage `ScalarValue::to_pyarrow` here? Expr::Literal(v) => match v { - ScalarValue::Boolean(Some(b)) => Ok(b.into_py(py)), - ScalarValue::Int8(Some(i)) => Ok(i.into_py(py)), - ScalarValue::Int16(Some(i)) => Ok(i.into_py(py)), - ScalarValue::Int32(Some(i)) => Ok(i.into_py(py)), - ScalarValue::Int64(Some(i)) => Ok(i.into_py(py)), - ScalarValue::UInt8(Some(i)) => Ok(i.into_py(py)), - ScalarValue::UInt16(Some(i)) => Ok(i.into_py(py)), - ScalarValue::UInt32(Some(i)) => Ok(i.into_py(py)), - ScalarValue::UInt64(Some(i)) => Ok(i.into_py(py)), - ScalarValue::Float32(Some(f)) => Ok(f.into_py(py)), - ScalarValue::Float64(Some(f)) => Ok(f.into_py(py)), - ScalarValue::Utf8(Some(s)) => Ok(s.into_py(py)), + // The unwraps here are for infallible conversions + ScalarValue::Boolean(Some(b)) => Ok(b.into_bound_py_any(py)?), + ScalarValue::Int8(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::Int16(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::Int32(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::Int64(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::UInt8(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::UInt16(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::UInt32(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::UInt64(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::Float32(Some(f)) => Ok(f.into_bound_py_any(py)?), + ScalarValue::Float64(Some(f)) => Ok(f.into_bound_py_any(py)?), + ScalarValue::Utf8(Some(s)) => 
Ok(s.into_bound_py_any(py)?), _ => Err(PyDataFusionError::Common(format!( "PyArrow can't handle ScalarValue: {v:?}" ))), @@ -98,8 +102,8 @@ impl TryFrom<&Expr> for PyArrowFilterExpression { // https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Expression.html#pyarrow-dataset-expression fn try_from(expr: &Expr) -> Result { Python::with_gil(|py| { - let pc = Python::import_bound(py, "pyarrow.compute")?; - let op_module = Python::import_bound(py, "operator")?; + let pc = Python::import(py, "pyarrow.compute")?; + let op_module = Python::import(py, "operator")?; let pc_expr: PyDataFusionResult> = match expr { Expr::Column(Column { name, .. }) => Ok(pc.getattr("field")?.call1((name,))?), Expr::Literal(scalar) => Ok(scalar_to_pyarrow(scalar, py)?.into_bound(py)), diff --git a/src/pyarrow_util.rs b/src/pyarrow_util.rs index 2b31467f8..cab708458 100644 --- a/src/pyarrow_util.rs +++ b/src/pyarrow_util.rs @@ -33,8 +33,8 @@ impl FromPyArrow for PyScalarValue { let val = value.call_method0("as_py")?; // construct pyarrow array from the python value and pyarrow type - let factory = py.import_bound("pyarrow")?.getattr("array")?; - let args = PyList::new_bound(py, [val]); + let factory = py.import("pyarrow")?.getattr("array")?; + let args = PyList::new(py, [val])?; let array = factory.call1((args, typ))?; // convert the pyarrow array to rust array using C data interface diff --git a/src/sql/logical.rs b/src/sql/logical.rs index 1be33b75f..96561c434 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -64,7 +64,7 @@ impl PyLogicalPlan { #[pymethods] impl PyLogicalPlan { /// Return the specific logical operator - pub fn to_variant(&self, py: Python) -> PyResult { + pub fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { match self.plan.as_ref() { LogicalPlan::Aggregate(plan) => PyAggregate::from(plan.clone()).to_variant(py), LogicalPlan::Analyze(plan) => PyAnalyze::from(plan.clone()).to_variant(py), @@ -132,7 +132,7 @@ impl PyLogicalPlan { 
datafusion_proto::protobuf::LogicalPlanNode::try_from_logical_plan(&self.plan, &codec)?; let bytes = proto.encode_to_vec(); - Ok(PyBytes::new_bound(py, &bytes)) + Ok(PyBytes::new(py, &bytes)) } #[staticmethod] diff --git a/src/substrait.rs b/src/substrait.rs index 8dcf3e8a7..1fefc0bbd 100644 --- a/src/substrait.rs +++ b/src/substrait.rs @@ -40,7 +40,7 @@ impl PyPlan { self.plan .encode(&mut proto_bytes) .map_err(PyDataFusionError::EncodeError)?; - Ok(PyBytes::new_bound(py, &proto_bytes).unbind().into()) + Ok(PyBytes::new(py, &proto_bytes).into()) } } @@ -95,7 +95,7 @@ impl PySubstraitSerializer { py: Python, ) -> PyDataFusionResult { let proto_bytes: Vec = wait_for_future(py, serializer::serialize_bytes(sql, &ctx.ctx))?; - Ok(PyBytes::new_bound(py, &proto_bytes).unbind().into()) + Ok(PyBytes::new(py, &proto_bytes).into()) } #[staticmethod] diff --git a/src/udaf.rs b/src/udaf.rs index 5f21533e0..34a9cd51d 100644 --- a/src/udaf.rs +++ b/src/udaf.rs @@ -29,6 +29,7 @@ use datafusion::logical_expr::{ }; use crate::common::data_type::PyScalarValue; +use crate::errors::to_datafusion_err; use crate::expr::PyExpr; use crate::utils::parse_volatility; @@ -73,7 +74,7 @@ impl Accumulator for RustAccumulator { .iter() .map(|arg| arg.into_data().to_pyarrow(py).unwrap()) .collect::>(); - let py_args = PyTuple::new_bound(py, py_args); + let py_args = PyTuple::new(py, py_args).map_err(to_datafusion_err)?; // 2. call function self.accum @@ -119,7 +120,7 @@ impl Accumulator for RustAccumulator { .iter() .map(|arg| arg.into_data().to_pyarrow(py).unwrap()) .collect::>(); - let py_args = PyTuple::new_bound(py, py_args); + let py_args = PyTuple::new(py, py_args).map_err(to_datafusion_err)?; // 2. 
call function self.accum diff --git a/src/udf.rs b/src/udf.rs index 4570e77a6..574c9d7b5 100644 --- a/src/udf.rs +++ b/src/udf.rs @@ -28,6 +28,7 @@ use datafusion::logical_expr::function::ScalarFunctionImplementation; use datafusion::logical_expr::ScalarUDF; use datafusion::logical_expr::{create_udf, ColumnarValue}; +use crate::errors::to_datafusion_err; use crate::expr::PyExpr; use crate::utils::parse_volatility; @@ -46,11 +47,11 @@ fn pyarrow_function_to_rust( .map_err(|e| DataFusionError::Execution(format!("{e:?}"))) }) .collect::, _>>()?; - let py_args = PyTuple::new_bound(py, py_args); + let py_args = PyTuple::new(py, py_args).map_err(to_datafusion_err)?; // 2. call function let value = func - .call_bound(py, py_args, None) + .call(py, py_args, None) .map_err(|e| DataFusionError::Execution(format!("{e:?}")))?; // 3. cast to arrow::array::Array diff --git a/src/udwf.rs b/src/udwf.rs index 04a4a1640..defd9c522 100644 --- a/src/udwf.rs +++ b/src/udwf.rs @@ -27,6 +27,7 @@ use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use crate::common::data_type::PyScalarValue; +use crate::errors::to_datafusion_err; use crate::expr::PyExpr; use crate::utils::parse_volatility; use datafusion::arrow::datatypes::DataType; @@ -56,8 +57,8 @@ impl PartitionEvaluator for RustPartitionEvaluator { fn get_range(&self, idx: usize, n_rows: usize) -> Result> { Python::with_gil(|py| { - let py_args = vec![idx.to_object(py), n_rows.to_object(py)]; - let py_args = PyTuple::new_bound(py, py_args); + let py_args = vec![idx.into_pyobject(py)?, n_rows.into_pyobject(py)?]; + let py_args = PyTuple::new(py, py_args)?; self.evaluator .bind(py) @@ -93,17 +94,14 @@ impl PartitionEvaluator for RustPartitionEvaluator { fn evaluate_all(&mut self, values: &[ArrayRef], num_rows: usize) -> Result { println!("evaluate all called with number of values {}", values.len()); Python::with_gil(|py| { - let py_values = PyList::new_bound( + let py_values = PyList::new( py, values .iter() .map(|arg| 
arg.into_data().to_pyarrow(py).unwrap()), - ); - let py_num_rows = num_rows.to_object(py).into_bound(py); - let py_args = PyTuple::new_bound( - py, - PyTuple::new_bound(py, vec![py_values.as_any(), &py_num_rows]), - ); + )?; + let py_num_rows = num_rows.into_pyobject(py)?; + let py_args = PyTuple::new(py, vec![py_values.as_any(), &py_num_rows])?; self.evaluator .bind(py) @@ -112,32 +110,28 @@ impl PartitionEvaluator for RustPartitionEvaluator { let array_data = ArrayData::from_pyarrow_bound(&v).unwrap(); make_array(array_data) }) - .map_err(|e| DataFusionError::Execution(format!("{e}"))) }) + .map_err(to_datafusion_err) } fn evaluate(&mut self, values: &[ArrayRef], range: &Range) -> Result { Python::with_gil(|py| { - let py_values = PyList::new_bound( + let py_values = PyList::new( py, values .iter() .map(|arg| arg.into_data().to_pyarrow(py).unwrap()), - ); - let range_tuple = - PyTuple::new_bound(py, vec![range.start.to_object(py), range.end.to_object(py)]); - let py_args = PyTuple::new_bound( - py, - PyTuple::new_bound(py, vec![py_values.as_any(), range_tuple.as_any()]), - ); + )?; + let range_tuple = PyTuple::new(py, vec![range.start, range.end])?; + let py_args = PyTuple::new(py, vec![py_values.as_any(), range_tuple.as_any()])?; self.evaluator .bind(py) .call_method1("evaluate", py_args) .and_then(|v| v.extract::()) .map(|v| v.0) - .map_err(|e| DataFusionError::Execution(format!("{e}"))) }) + .map_err(to_datafusion_err) } fn evaluate_all_with_rank( @@ -148,23 +142,27 @@ impl PartitionEvaluator for RustPartitionEvaluator { Python::with_gil(|py| { let ranks = ranks_in_partition .iter() - .map(|r| PyTuple::new_bound(py, vec![r.start, r.end])); + .map(|r| PyTuple::new(py, vec![r.start, r.end])) + .collect::>>()?; // 1. 
cast args to Pyarrow array - let py_args = vec![num_rows.to_object(py), PyList::new_bound(py, ranks).into()]; + let py_args = vec![ + num_rows.into_pyobject(py)?.into_any(), + PyList::new(py, ranks)?.into_any(), + ]; - let py_args = PyTuple::new_bound(py, py_args); + let py_args = PyTuple::new(py, py_args)?; // 2. call function self.evaluator .bind(py) .call_method1("evaluate_all_with_rank", py_args) - .map_err(|e| DataFusionError::Execution(format!("{e}"))) .map(|v| { let array_data = ArrayData::from_pyarrow_bound(&v).unwrap(); make_array(array_data) }) }) + .map_err(to_datafusion_err) } fn supports_bounded_execution(&self) -> bool { From e6f6e66c1d180246ad933f8bcc0d40faa8426dfa Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 21 Feb 2025 16:03:36 -0500 Subject: [PATCH 099/248] Add user documentation for the FFI approach (#1031) * Initial commit for FFI user documentation * Update readme to point to the online documentation. Fix a small typo. * Small text adjustments for clarity and formatting --- README.md | 11 +- docs/source/contributor-guide/ffi.rst | 212 ++++++++++++++++++++++++++ docs/source/index.rst | 1 + 3 files changed, 220 insertions(+), 4 deletions(-) create mode 100644 docs/source/contributor-guide/ffi.rst diff --git a/README.md b/README.md index 5aaf7f5f3..9c56b62dd 100644 --- a/README.md +++ b/README.md @@ -30,10 +30,8 @@ DataFusion's Python bindings can be used as a foundation for building new data s planning, and logical plan optimizations, and then transpiles the logical plan to Dask operations for execution. - [DataFusion Ballista](https://github.com/apache/datafusion-ballista) is a distributed SQL query engine that extends DataFusion's Python bindings for distributed use cases. 
- -It is also possible to use these Python bindings directly for DataFrame and SQL operations, but you may find that -[Polars](http://pola.rs/) and [DuckDB](http://www.duckdb.org/) are more suitable for this use case, since they have -more of an end-user focus and are more actively maintained than these Python bindings. +- [DataFusion Ray](https://github.com/apache/datafusion-ray) is another distributed query engine that uses + DataFusion's Python bindings. ## Features @@ -114,6 +112,11 @@ Printing the context will show the current configuration settings. print(ctx) ``` +## Extensions + +For information about how to extend DataFusion Python, please see the extensions page of the +[online documentation](https://datafusion.apache.org/python/). + ## More Examples See [examples](examples/README.md) for more information. diff --git a/docs/source/contributor-guide/ffi.rst b/docs/source/contributor-guide/ffi.rst new file mode 100644 index 000000000..c1f9806b3 --- /dev/null +++ b/docs/source/contributor-guide/ffi.rst @@ -0,0 +1,212 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. 
+ +Python Extensions +================= + +The DataFusion in Python project is designed to allow users to extend its functionality in a few core +areas. Ideally many users would like to package their extensions as a Python package and easily +integrate that package with this project. This page serves to describe some of the challenges we face +when doing these integrations and the approach our project uses. + +The Primary Issue +----------------- + +Suppose you wish to use DataFusion and you have a custom data source that can produce tables that +can then be queried against, similar to how you can register a :ref:`CSV ` or +:ref:`Parquet ` file. In DataFusion terminology, you likely want to implement a +:ref:`Custom Table Provider `. In an effort to make your data source +as performant as possible and to utilize the features of DataFusion, you may decide to write +your source in Rust and then expose it through `PyO3 `_ as a Python library. + +At first glance, it may appear the best way to do this is to add the ``datafusion-python`` +crate as a dependency, provide a ``PyTable``, and then to register it with the +``SessionContext``. Unfortunately, this will not work. + +When you produce your code as a Python library and it needs to interact with the DataFusion +library, at the lowest level they communicate through an Application Binary Interface (ABI). +The acronym sounds similar to API (Application Programming Interface), but it is distinctly +different. + +The ABI sets the standard for how these libraries can share data and functions between each +other. One of the key differences between Rust and other programming languages is that Rust +does not have a stable ABI. What this means in practice is that if you compile a Rust library +with one version of the ``rustc`` compiler and I compile another library to interface with it +but I use a different version of the compiler, there is no guarantee the interface will be +the same. 
+ +In practice, this means that a Python library built with ``datafusion-python`` as a Rust +dependency will generally **not** be compatible with the DataFusion Python package, even +if they reference the same version of ``datafusion-python``. If you attempt to do this, it may +work on your local computer if you have built both packages with the same optimizations. +This can sometimes lead to a false expectation that the code will work, but it frequently +breaks the moment you try to use your package against the released packages. + +You can find more information about the Rust ABI in their +`online documentation `_. + +The FFI Approach +---------------- + +Rust supports interacting with other programming languages through its Foreign Function +Interface (FFI). The advantage of using the FFI is that it enables you to write data structures +and functions that have a stable ABI. This allows you to use Rust code with C, Python, and +other languages. In fact, the `PyO3 `_ library uses the FFI to share data +and functions between Python and Rust. + +The approach we are taking in the DataFusion in Python project is to incrementally expose +more portions of the DataFusion project via FFI interfaces. This allows users to write Rust +code that does **not** require the ``datafusion-python`` crate as a dependency, expose their +code in Python via PyO3, and have it interact with the DataFusion Python package. + +Early adopters of this approach include `delta-rs `_ +who has adapted their Table Provider for use in ``datafusion-python`` with only a few lines +of code. Also, the DataFusion Python project uses the existing definitions from +`Apache Arrow CStream Interface `_ +to support importing **and** exporting tables. Any Python package that supports reading +the Arrow C Stream interface can work with DataFusion Python out of the box! You can read +more about working with Arrow sources in the :ref:`Data Sources ` +page.
+ +To learn more about the Foreign Function Interface in Rust, the +`Rustonomicon `_ is a good resource. + +Inspiration from Arrow +---------------------- + +DataFusion is built upon `Apache Arrow `_. The canonical Python +Arrow implementation, `pyarrow `_ provides +an excellent way to share Arrow data between Python projects without performing any copy +operations on the data. They do this by using a well defined set of interfaces. You can +find the details about their stream interface +`here `_. The +`Rust Arrow Implementation `_ also supports these +``C`` style definitions via the Foreign Function Interface. + +In addition to using these interfaces to transfer Arrow data between libraries, ``pyarrow`` +goes one step further to make sharing the interfaces easier in Python. They do this +by exposing PyCapsules that contain the expected functionality. + +You can learn more about PyCapsules from the official +`Python online documentation `_. PyCapsules +have excellent support in PyO3 already. The +`PyO3 online documentation `_ is a good source +for more details on using PyCapsules in Rust. + +Two lessons we leverage from the Arrow project in DataFusion Python are: + +- We reuse the existing Arrow FFI functionality wherever possible. +- We expose PyCapsules that contain a FFI stable struct. + +Implementation Details +---------------------- + +The bulk of the code necessary to perform our FFI operations is in the upstream +`DataFusion `_ core repository. You can review the code and +documentation in the `datafusion-ffi`_ crate. + +Our FFI implementation is narrowly focused at sharing data and functions with Rust backed +libraries. This allows us to use the `abi_stable crate `_. +This is an excellent crate that allows for easy conversion between Rust native types +and FFI-safe alternatives. For example, if you needed to pass a ``Vec`` via FFI, +you can simply convert it to a ``RVec`` in an intuitive manner. 
It also supports +features like ``RResult`` and ``ROption`` that do not have an obvious translation to a +C equivalent. + +The `datafusion-ffi`_ crate has been designed to make it easy to convert from DataFusion +traits into their FFI counterparts. For example, if you have defined a custom +`TableProvider `_ +and you want to create a sharable FFI counterpart, you could write: + +.. code-block:: rust + + let my_provider = MyTableProvider::default(); + let ffi_provider = FFI_TableProvider::new(Arc::new(my_provider), false, None); + +If you were interfacing with a library that provided the above ``FFI_TableProvider`` and +you needed to turn it back into a ``TableProvider``, you can turn it into a +``ForeignTableProvider`` which implements the ``TableProvider`` trait. + +.. code-block:: rust + + let foreign_provider: ForeignTableProvider = ffi_provider.into(); + +If you review the code in `datafusion-ffi`_ you will find that each of the traits we share +across the boundary has two portions, one with a ``FFI_`` prefix and one with a ``Foreign`` +prefix. This is used to distinguish which side of the FFI boundary that struct is +designed to be used on. The structures with the ``FFI_`` prefix are to be used on the +**provider** of the structure. In the example we're showing, this means the code that has +written the underlying ``TableProvider`` implementation to access your custom data source. +The structures with the ``Foreign`` prefix are to be used by the receiver. In this case, +it is the ``datafusion-python`` library. + +In order to share these FFI structures, we need to wrap them in some kind of Python object +that can be used to interface from one package to another. As described in the above +section on our inspiration from Arrow, we use ``PyCapsule``. We can create a ``PyCapsule`` +for our provider thusly: + +..
code-block:: rust + + let name = CString::new("datafusion_table_provider")?; + let my_capsule = PyCapsule::new_bound(py, provider, Some(name))?; + +On the receiving side, turn this pycapsule object into the ``FFI_TableProvider``, which +can then be turned into a ``ForeignTableProvider``. The associated code is: + +.. code-block:: rust + + let capsule = capsule.downcast::<PyCapsule>()?; + let provider = unsafe { capsule.reference::<FFI_TableProvider>() }; + +By convention the ``datafusion-python`` library expects a Python object that has a +``TableProvider`` PyCapsule to have this capsule accessible by calling a function named +``__datafusion_table_provider__``. You can see a complete working example of how to +share a ``TableProvider`` from one python library to DataFusion Python in the +`repository examples folder `_. + +This section has been written using ``TableProvider`` as an example. It is the first +extension that has been written using this approach and the most thoroughly implemented. +As we continue to expose more of the DataFusion features, we intend to follow this same +design pattern. + +Alternative Approach +-------------------- + +Suppose you needed to expose some other features of DataFusion and you could not wait +for the upstream repository to implement the FFI approach we describe. In this case +you decide to create your dependency on the ``datafusion-python`` crate instead. + +As we discussed, this is not guaranteed to work across different compiler versions and +optimization levels. If you wish to go down this route, there are two approaches we +have identified you can use. + +#. Re-export all of ``datafusion-python`` yourself with your extensions built in. +#. Carefully synchronize your software releases with the ``datafusion-python`` CI build + system so that your libraries use the exact same compiler, features, and + optimization level. + +We currently do not recommend either of these approaches as they are difficult to +maintain over a long period.
Additionally, they require a tight version coupling +between libraries. + +Status of Work +-------------- + +At the time of this writing, the FFI features are under active development. To see +the latest status, we recommend reviewing the code in the `datafusion-ffi`_ crate. + +.. _datafusion-ffi: https://crates.io/crates/datafusion-ffi diff --git a/docs/source/index.rst b/docs/source/index.rst index 34eb23b28..558b2d572 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -85,6 +85,7 @@ Example :caption: CONTRIBUTOR GUIDE contributor-guide/introduction + contributor-guide/ffi .. _toc.api: .. toctree:: From 3f3983cc86ffe267cff97480241e8a588ac38fa3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 23 Feb 2025 08:00:52 -0500 Subject: [PATCH 100/248] build(deps): bump arrow from 54.1.0 to 54.2.0 (#1035) Bumps [arrow](https://github.com/apache/arrow-rs) from 54.1.0 to 54.2.0. - [Release notes](https://github.com/apache/arrow-rs/releases) - [Changelog](https://github.com/apache/arrow-rs/blob/main/CHANGELOG-old.md) - [Commits](https://github.com/apache/arrow-rs/compare/54.1.0...54.2.0) --- updated-dependencies: - dependency-name: arrow dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 56 +++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f1b1ed50a..d23ed6169 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -179,9 +179,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6422e12ac345a0678d7a17e316238e3a40547ae7f92052b77bd86d5e0239f3fc" +checksum = "755b6da235ac356a869393c23668c663720b8749dd6f15e52b6c214b4b964cc7" dependencies = [ "arrow-arith", "arrow-array", @@ -201,9 +201,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23cf34bb1f48c41d3475927bcc7be498665b8e80b379b88f62a840337f8b8248" +checksum = "64656a1e0b13ca766f8440752e9a93e11014eec7b67909986f83ed0ab1fe37b8" dependencies = [ "arrow-array", "arrow-buffer", @@ -215,9 +215,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb4a06d507f54b70a277be22a127c8ffe0cec6cd98c0ad8a48e77779bbda8223" +checksum = "57a4a6d2896083cfbdf84a71a863b22460d0708f8206a8373c52e326cc72ea1a" dependencies = [ "ahash", "arrow-buffer", @@ -232,9 +232,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d69d326d5ad1cb82dcefa9ede3fee8fdca98f9982756b16f9cb142f4aa6edc89" +checksum = "cef870583ce5e4f3b123c181706f2002fb134960f9a911900f64ba4830c7a43a" dependencies = [ "bytes", "half", @@ -243,9 +243,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.1.0" 
+version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "626e65bd42636a84a238bed49d09c8777e3d825bf81f5087a70111c2831d9870" +checksum = "1ac7eba5a987f8b4a7d9629206ba48e19a1991762795bbe5d08497b7736017ee" dependencies = [ "arrow-array", "arrow-buffer", @@ -264,9 +264,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71c8f959f7a1389b1dbd883cdcd37c3ed12475329c111912f7f69dad8195d8c6" +checksum = "90f12542b8164398fc9ec595ff783c4cf6044daa89622c5a7201be920e4c0d4c" dependencies = [ "arrow-array", "arrow-cast", @@ -280,9 +280,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1858e7c7d01c44cf71c21a85534fd1a54501e8d60d1195d0d6fbcc00f4b10754" +checksum = "b095e8a4f3c309544935d53e04c3bfe4eea4e71c3de6fe0416d1f08bb4441a83" dependencies = [ "arrow-buffer", "arrow-schema", @@ -292,9 +292,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6bb3f727f049884c7603f0364bc9315363f356b59e9f605ea76541847e06a1e" +checksum = "65c63da4afedde2b25ef69825cd4663ca76f78f79ffe2d057695742099130ff6" dependencies = [ "arrow-array", "arrow-buffer", @@ -306,9 +306,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35de94f165ed8830aede72c35f238763794f0d49c69d30c44d49c9834267ff8c" +checksum = "9551d9400532f23a370cabbea1dc5a53c49230397d41f96c4c8eedf306199305" dependencies = [ "arrow-array", "arrow-buffer", @@ -326,9 +326,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8aa06e5f267dc53efbacb933485c79b6fc1685d3ffbe870a16ce4e696fb429da" +checksum = "6c07223476f8219d1ace8cd8d85fa18c4ebd8d945013f25ef5c72e85085ca4ee" dependencies = [ "arrow-array", "arrow-buffer", @@ -339,9 +339,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66f1144bb456a2f9d82677bd3abcea019217e572fc8f07de5a7bac4b2c56eb2c" +checksum = "91b194b38bfd89feabc23e798238989c6648b2506ad639be42ec8eb1658d82c4" dependencies = [ "arrow-array", "arrow-buffer", @@ -352,18 +352,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "105f01ec0090259e9a33a9263ec18ff223ab91a0ea9fbc18042f7e38005142f6" +checksum = "0f40f6be8f78af1ab610db7d9b236e21d587b7168e368a36275d2e5670096735" dependencies = [ "bitflags 2.8.0", ] [[package]] name = "arrow-select" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f690752fdbd2dee278b5f1636fefad8f2f7134c85e20fd59c4199e15a39a6807" +checksum = "ac265273864a820c4a179fc67182ccc41ea9151b97024e1be956f0f2369c2539" dependencies = [ "ahash", "arrow-array", @@ -375,9 +375,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0fff9cd745a7039b66c47ecaf5954460f9fa12eed628f65170117ea93e64ee0" +checksum = "d44c8eed43be4ead49128370f7131f054839d3d6003e52aebf64322470b8fbd0" dependencies = [ "arrow-array", "arrow-buffer", From 69ebf70bd821d0ae516d2f61d96058e2252a7a1f Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 24 Feb 2025 21:30:52 +0100 Subject: [PATCH 101/248] Chore: Release datafusion-python 45 (#1024) * Bump version number to prepare for release * Add changelog 45.0.0 * Add deprecated marker from either typing or typing_extensions based 
on the python version * Limit pyarrow version per issue # 1023 * Bumping the version number to support new release candidate * There was no guarantee that the record batches would be returned in a single partition, so update the unit test to check all partitions. * Revert "Limit pyarrow version per issue # 1023" This reverts commit b48d5872661017ec21ea71f7dbb9569f2f0bf797. * Correct import for python 3.13 and above * Bump minor version due to pypi requirement * Update cargo lock --- Cargo.lock | 113 +++++++++++++-------------------- Cargo.toml | 2 +- dev/changelog/45.0.0.md | 42 ++++++++++++ python/datafusion/context.py | 5 +- python/datafusion/dataframe.py | 5 +- python/datafusion/expr.py | 6 +- python/datafusion/substrait.py | 5 +- python/tests/test_dataframe.py | 21 ++++-- 8 files changed, 118 insertions(+), 81 deletions(-) create mode 100644 dev/changelog/45.0.0.md diff --git a/Cargo.lock b/Cargo.lock index d23ed6169..5c7f2bf3c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -606,19 +606,18 @@ dependencies = [ [[package]] name = "bzip2" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bafdbf26611df8c14810e268ddceda071c297570a5fb360ceddf617fe417ef58" +checksum = "75b89e7c29231c673a61a46e722602bcd138298f6b9e81e71119693534585f5c" dependencies = [ "bzip2-sys", - "libc", ] [[package]] name = "bzip2-sys" -version = "0.1.11+1.0.8" +version = "0.1.12+1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +checksum = "72ebc2f1a417f01e1da30ef264ee86ae31d2dcd2d603ea283d3c244a883ca2a9" dependencies = [ "cc", "libc", @@ -627,9 +626,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.12" +version = "1.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "755717a7de9ec452bf7f3f1a3099085deabd7f2962b861dae91ecd7a365903d2" +checksum = 
"0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9" dependencies = [ "jobserver", "libc", @@ -684,21 +683,20 @@ dependencies = [ [[package]] name = "cmake" -version = "0.1.53" +version = "0.1.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e24a03c8b52922d68a1589ad61032f2c1aa5a8158d2aa0d93c6e9534944bbad6" +checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" dependencies = [ "cc", ] [[package]] name = "comfy-table" -version = "7.1.3" +version = "7.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24f165e7b643266ea80cb858aed492ad9280e3e05ce24d4a99d7d7b889b6a4d9" +checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a" dependencies = [ - "strum", - "strum_macros", + "unicode-segmentation", "unicode-width", ] @@ -837,9 +835,9 @@ dependencies = [ [[package]] name = "csv-core" -version = "0.1.11" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" dependencies = [ "memchr", ] @@ -878,7 +876,7 @@ dependencies = [ "async-compression", "async-trait", "bytes", - "bzip2 0.5.0", + "bzip2 0.5.1", "chrono", "datafusion-catalog", "datafusion-common", @@ -1240,7 +1238,7 @@ dependencies = [ "itertools 0.14.0", "log", "paste", - "petgraph 0.7.1", + "petgraph", ] [[package]] @@ -1341,7 +1339,7 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "44.0.0" +version = "45.2.0" dependencies = [ "arrow", "async-trait", @@ -1436,9 +1434,9 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" [[package]] name = "equivalent" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +checksum = 
"877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" @@ -1456,12 +1454,6 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" -[[package]] -name = "fixedbitset" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" - [[package]] name = "fixedbitset" version = "0.5.7" @@ -2269,9 +2261,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8402cab7aefae129c6977bb0ff1b8fd9a04eb5b51efc50a70bea51cda0c7924" +checksum = "b3b1c9bd4fe1f0f8b387f6eb9eb3b4a1aa26185e5750efb9140301703f62cd1b" dependencies = [ "adler2", ] @@ -2548,23 +2540,13 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" -[[package]] -name = "petgraph" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" -dependencies = [ - "fixedbitset 0.4.2", - "indexmap", -] - [[package]] name = "petgraph" version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ - "fixedbitset 0.5.7", + "fixedbitset", "indexmap", ] @@ -2660,9 +2642,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c0fef6c4230e4ccf618a35c59d7ede15dea37de8427500f50aff708806e42ec" +checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" dependencies = 
[ "bytes", "prost-derive", @@ -2670,16 +2652,16 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0f3e5beed80eb580c68e2c600937ac2c4eedabdfd5ef1e5b7ea4f3fba84497b" +checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" dependencies = [ "heck", - "itertools 0.13.0", + "itertools 0.14.0", "log", "multimap", "once_cell", - "petgraph 0.6.5", + "petgraph", "prettyplease", "prost", "prost-types", @@ -2690,12 +2672,12 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "157c5a9d7ea5c2ed2d9fb8f495b64759f7816c7eaea54ba3978f0d63000162e3" +checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.13.0", + "itertools 0.14.0", "proc-macro2", "quote", "syn 2.0.98", @@ -2703,9 +2685,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc2f1e56baa61e93533aebc21af4d2134b70f66275e0fcdf3cbe43d77ff7e8fc" +checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" dependencies = [ "prost", ] @@ -2721,9 +2703,9 @@ dependencies = [ [[package]] name = "psm" -version = "0.1.24" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200b9ff220857e53e184257720a14553b2f4aa02577d2ed9842d45d4b9654810" +checksum = "f58e5423e24c18cc840e1c98370b3993c6649cd1678b4d24318bcf0a083cbe88" dependencies = [ "cc", ] @@ -2860,9 +2842,9 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.9" +version = "0.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c40286217b4ba3a71d644d752e6a0b71f13f1b6a2c5311acfcbe0c2418ed904" +checksum = 
"e46f3055866785f6b92bc6164b76be02ca8f2eb4b002c0354b28cf4c119e5944" dependencies = [ "cfg_aliases", "libc", @@ -3042,15 +3024,14 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.8" +version = "0.17.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +checksum = "e75ec5e92c4d8aede845126adc388046234541629e76029599ed35a003c7ed24" dependencies = [ "cc", "cfg-if", "getrandom 0.2.15", "libc", - "spin", "untrusted", "windows-sys 0.52.0", ] @@ -3097,9 +3078,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.22" +version = "0.23.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb9263ab4eb695e42321db096e3b8fbd715a59b154d5c88d82db2175b681ba7" +checksum = "47796c98c480fce5406ef69d1c76378375492c3b0a0de587be0c1d9feb12f395" dependencies = [ "once_cell", "ring", @@ -3377,9 +3358,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.13.2" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" [[package]] name = "snafu" @@ -3418,12 +3399,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" - [[package]] name = "sqlparser" version = "0.53.0" @@ -3453,9 +3428,9 @@ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "stacker" -version = "0.1.17" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "799c883d55abdb5e98af1a7b3f23b9b6de8ecada0ecac058672d7635eb48ca7b" +checksum = "1d08feb8f695b465baed819b03c128dc23f57a694510ab1f06c77f763975685e" dependencies = [ "cc", 
"cfg-if", diff --git a/Cargo.toml b/Cargo.toml index d18e0e8f0..5358b1836 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "44.0.0" +version = "45.2.0" homepage = "https://datafusion.apache.org/python" repository = "https://github.com/apache/datafusion-python" authors = ["Apache DataFusion "] diff --git a/dev/changelog/45.0.0.md b/dev/changelog/45.0.0.md new file mode 100644 index 000000000..93659b171 --- /dev/null +++ b/dev/changelog/45.0.0.md @@ -0,0 +1,42 @@ + + +# Apache DataFusion Python 45.0.0 Changelog + +This release consists of 2 commits from 2 contributors. See credits at the end of this changelog for more information. + +**Fixed bugs:** + +- fix: add to_timestamp_nanos [#1020](https://github.com/apache/datafusion-python/pull/1020) (chenkovsky) + +**Other:** + +- Chore/upgrade datafusion 45 [#1010](https://github.com/apache/datafusion-python/pull/1010) (kevinjqliu) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 1 Kevin Liu + 1 Tim Saucer +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. 
+ diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 864ef1c8b..21955b6d1 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -21,7 +21,10 @@ from typing import TYPE_CHECKING, Any, Protocol -from typing_extensions import deprecated +try: + from warnings import deprecated # Python 3.13+ +except ImportError: + from typing_extensions import deprecated # Python 3.12 from datafusion.catalog import Catalog, Table from datafusion.dataframe import DataFrame diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 7413a5fa3..23b5d630b 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -33,7 +33,10 @@ overload, ) -from typing_extensions import deprecated +try: + from warnings import deprecated # Python 3.13+ +except ImportError: + from typing_extensions import deprecated # Python 3.12 from datafusion.plan import ExecutionPlan, LogicalPlan from datafusion.record_batch import RecordBatchStream diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 68ddd7c9a..e3d7158eb 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -25,7 +25,11 @@ from typing import TYPE_CHECKING, Any, Optional, Type import pyarrow as pa -from typing_extensions import deprecated + +try: + from warnings import deprecated # Python 3.13+ +except ImportError: + from typing_extensions import deprecated # Python 3.12 from datafusion.common import DataTypeMap, NullTreatment, RexType diff --git a/python/datafusion/substrait.py b/python/datafusion/substrait.py index 402184d3f..06302fe38 100644 --- a/python/datafusion/substrait.py +++ b/python/datafusion/substrait.py @@ -26,7 +26,10 @@ import pathlib from typing import TYPE_CHECKING -from typing_extensions import deprecated +try: + from warnings import deprecated # Python 3.13+ +except ImportError: + from typing_extensions import deprecated # Python 3.12 from datafusion.plan import LogicalPlan diff --git 
a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 5bc3fb094..c636e896a 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -755,13 +755,20 @@ def test_execution_plan(aggregate_df): assert "CsvExec:" in indent ctx = SessionContext() - stream = ctx.execute(plan, 0) - # get the one and only batch - batch = stream.next() - assert batch is not None - # there should be no more batches - with pytest.raises(StopIteration): - stream.next() + rows_returned = 0 + for idx in range(0, plan.partition_count): + stream = ctx.execute(plan, idx) + try: + batch = stream.next() + assert batch is not None + rows_returned += len(batch.to_pyarrow()[0]) + except StopIteration: + # This is one of the partitions with no values + pass + with pytest.raises(StopIteration): + stream.next() + + assert rows_returned == 5 def test_repartition(df): From a80a788f69cf46ef002b3c537837548cc103748c Mon Sep 17 00:00:00 2001 From: kosiew Date: Sat, 8 Mar 2025 21:22:36 +0800 Subject: [PATCH 102/248] Enable Dataframe to be converted into views which can be used in register_table (#1016) * add test_view * feat: add into_view method to register DataFrame as a view * add pytableprovider * feat: add as_table method to PyTableProvider and update into_view to return PyTable * refactor: simplify as_table method and update documentation for into_view * test: improve test_register_filtered_dataframe by removing redundant comments and assertions * test: enhance test_register_filtered_dataframe with additional assertions for DataFrame results * ruff formatted * cleanup: remove unused imports from test_view.py * docs: add example for registering a DataFrame as a view in README.md * docs: update docstring for into_view method to clarify usage as ViewTable * chore: add license header to test_view.py * ruff correction * refactor: rename into_view method to _into_view * ruff lint * refactor: simplify into_view method and update Rust binding convention * docs: add views 
section to user guide with example on registering views * feat: add register_view method to SessionContext for DataFrame registration * docs: update README and user guide to reflect register_view method for DataFrame registration * docs: remove some documentation from PyDataFrame --- README.md | 40 +++++++++++++ .../user-guide/common-operations/index.rst | 1 + .../user-guide/common-operations/views.rst | 58 +++++++++++++++++++ python/datafusion/context.py | 12 ++++ python/datafusion/dataframe.py | 4 ++ python/tests/test_view.py | 49 ++++++++++++++++ src/dataframe.rs | 39 +++++++++++++ 7 files changed, 203 insertions(+) create mode 100644 docs/source/user-guide/common-operations/views.rst create mode 100644 python/tests/test_view.py diff --git a/README.md b/README.md index 9c56b62dd..4f80dbe18 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,46 @@ This produces the following chart: ![Chart](examples/chart.png) +## Registering a DataFrame as a View + +You can use SessionContext's `register_view` method to convert a DataFrame into a view and register it with the context. 
+ +```python +from datafusion import SessionContext, col, literal + +# Create a DataFusion context +ctx = SessionContext() + +# Create sample data +data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]} + +# Create a DataFrame from the dictionary +df = ctx.from_pydict(data, "my_table") + +# Filter the DataFrame (for example, keep rows where a > 2) +df_filtered = df.filter(col("a") > literal(2)) + +# Register the dataframe as a view with the context +ctx.register_view("view1", df_filtered) + +# Now run a SQL query against the registered view +df_view = ctx.sql("SELECT * FROM view1") + +# Collect the results +results = df_view.collect() + +# Convert results to a list of dictionaries for display +result_dicts = [batch.to_pydict() for batch in results] + +print(result_dicts) +``` + +This will output: + +```python +[{'a': [3, 4, 5], 'b': [30, 40, 50]}] +``` + ## Configuration It is possible to configure runtime (memory and disk settings) and configuration settings when creating a context. diff --git a/docs/source/user-guide/common-operations/index.rst b/docs/source/user-guide/common-operations/index.rst index d7c708c21..7abd1f138 100644 --- a/docs/source/user-guide/common-operations/index.rst +++ b/docs/source/user-guide/common-operations/index.rst @@ -23,6 +23,7 @@ The contents of this section are designed to guide a new user through how to use .. toctree:: :maxdepth: 2 + views basic-info select-and-filter expressions diff --git a/docs/source/user-guide/common-operations/views.rst b/docs/source/user-guide/common-operations/views.rst new file mode 100644 index 000000000..df11e3abe --- /dev/null +++ b/docs/source/user-guide/common-operations/views.rst @@ -0,0 +1,58 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. 
"License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +====================== +Registering Views +====================== + +You can use the context's ``register_view`` method to register a DataFrame as a view + +.. code-block:: python + + from datafusion import SessionContext, col, literal + + # Create a DataFusion context + ctx = SessionContext() + + # Create sample data + data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]} + + # Create a DataFrame from the dictionary + df = ctx.from_pydict(data, "my_table") + + # Filter the DataFrame (for example, keep rows where a > 2) + df_filtered = df.filter(col("a") > literal(2)) + + # Register the dataframe as a view with the context + ctx.register_view("view1", df_filtered) + + # Now run a SQL query against the registered view + df_view = ctx.sql("SELECT * FROM view1") + + # Collect the results + results = df_view.collect() + + # Convert results to a list of dictionaries for display + result_dicts = [batch.to_pydict() for batch in results] + + print(result_dicts) + +This will output: + +.. 
code-block:: python + + [{'a': [3, 4, 5], 'b': [30, 40, 50]}] diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 21955b6d1..befc4dce6 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -707,6 +707,18 @@ def from_polars(self, data: polars.DataFrame, name: str | None = None) -> DataFr """ return DataFrame(self.ctx.from_polars(data, name)) + # https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 + # is the discussion on how we arrived at adding register_view + def register_view(self, name: str, df: DataFrame): + """Register a :py:class:`~datafusion.dataframe.DataFrame` as a view. + + Args: + name (str): The name to register the view under. + df (DataFrame): The DataFrame to be converted into a view and registered. + """ + view = df.into_view() + self.ctx.register_table(name, view) + def register_table(self, name: str, table: Table) -> None: """Register a :py:class: `~datafusion.catalog.Table` as a table. diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 23b5d630b..85a179ec9 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -124,6 +124,10 @@ def __init__(self, df: DataFrameInternal) -> None: """ self.df = df + def into_view(self) -> pa.Table: + """Convert DataFrame as a ViewTable which can be used in register_table.""" + return self.df.into_view() + def __getitem__(self, key: str | List[str]) -> DataFrame: """Return a new :py:class`DataFrame` with the specified column or columns. diff --git a/python/tests/test_view.py b/python/tests/test_view.py new file mode 100644 index 000000000..1d92cc0d4 --- /dev/null +++ b/python/tests/test_view.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +from datafusion import SessionContext, col, literal + + +def test_register_filtered_dataframe(): + ctx = SessionContext() + + data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]} + + df = ctx.from_pydict(data, "my_table") + + df_filtered = df.filter(col("a") > literal(2)) + + ctx.register_view("view1", df_filtered) + + df_view = ctx.sql("SELECT * FROM view1") + + filtered_results = df_view.collect() + + result_dicts = [batch.to_pydict() for batch in filtered_results] + + expected_results = [{"a": [3, 4, 5], "b": [30, 40, 50]}] + + assert result_dicts == expected_results + + df_results = df.collect() + + df_result_dicts = [batch.to_pydict() for batch in df_results] + + expected_df_results = [{"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}] + + assert df_result_dicts == expected_df_results diff --git a/src/dataframe.rs b/src/dataframe.rs index ed9578a71..243e2e14f 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -30,6 +30,7 @@ use datafusion::arrow::util::pretty; use datafusion::common::UnnestOptions; use datafusion::config::{CsvOptions, TableParquetOptions}; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; +use datafusion::datasource::TableProvider; use datafusion::execution::SendableRecordBatchStream; use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; use datafusion::prelude::*; @@ -39,6 +40,7 @@ use pyo3::pybacked::PyBackedStr; use 
pyo3::types::{PyCapsule, PyTuple, PyTupleMethods}; use tokio::task::JoinHandle; +use crate::catalog::PyTable; use crate::errors::{py_datafusion_err, PyDataFusionError}; use crate::expr::sort_expr::to_sort_expressions; use crate::physical_plan::PyExecutionPlan; @@ -50,6 +52,25 @@ use crate::{ expr::{sort_expr::PySortExpr, PyExpr}, }; +// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 +// - we have not decided on the table_provider approach yet +// this is an interim implementation +#[pyclass(name = "TableProvider", module = "datafusion")] +pub struct PyTableProvider { + provider: Arc, +} + +impl PyTableProvider { + pub fn new(provider: Arc) -> Self { + Self { provider } + } + + pub fn as_table(&self) -> PyTable { + let table_provider: Arc = self.provider.clone(); + PyTable::new(table_provider) + } +} + /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. /// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment. @@ -156,6 +177,24 @@ impl PyDataFrame { PyArrowType(self.df.schema().into()) } + /// Convert this DataFrame into a Table that can be used in register_table + /// By convention, into_... methods consume self and return the new object. + /// Disabling the clippy lint, so we can use &self + /// because we're working with Python bindings + /// where objects are shared + /// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 + /// - we have not decided on the table_provider approach yet + #[allow(clippy::wrong_self_convention)] + fn into_view(&self) -> PyDataFusionResult { + // Call the underlying Rust DataFrame::into_view method. + // Note that the Rust method consumes self; here we clone the inner Arc + // so that we don’t invalidate this PyDataFrame. 
+ let table_provider = self.df.as_ref().clone().into_view(); + let table_provider = PyTableProvider::new(table_provider); + + Ok(table_provider.as_table()) + } + #[pyo3(signature = (*args))] fn select_columns(&self, args: Vec) -> PyDataFusionResult { let args = args.iter().map(|s| s.as_ref()).collect::>(); From 9027b4d79fdd7a41dd9c1f25c2ecebc1fabf50f2 Mon Sep 17 00:00:00 2001 From: Chen Chongchen Date: Sat, 8 Mar 2025 21:24:02 +0800 Subject: [PATCH 103/248] fix: type checking (#993) * fix: type checking * update license * format * format * update catalog * revert type annotation * format * format * update --- python/datafusion/catalog.py | 5 +++-- python/datafusion/context.py | 19 ++++++++++++------ python/datafusion/dataframe.py | 3 ++- python/datafusion/expr.py | 8 ++++---- python/datafusion/functions.py | 10 +++++++--- python/datafusion/input/location.py | 10 +++++----- python/datafusion/udf.py | 7 ++++--- python/tests/test_functions.py | 30 +++++++++++++++++++++++++++++ 8 files changed, 68 insertions(+), 24 deletions(-) diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py index 703037665..0560f4704 100644 --- a/python/datafusion/catalog.py +++ b/python/datafusion/catalog.py @@ -66,11 +66,12 @@ def __init__(self, table: df_internal.Table) -> None: """This constructor is not typically called by the end user.""" self.table = table + @property def schema(self) -> pyarrow.Schema: """Returns the schema associated with this table.""" - return self.table.schema() + return self.table.schema @property def kind(self) -> str: """Returns the kind of table.""" - return self.table.kind() + return self.table.kind diff --git a/python/datafusion/context.py b/python/datafusion/context.py index befc4dce6..282b2a477 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -728,7 +728,7 @@ def register_table(self, name: str, table: Table) -> None: name: Name of the resultant table. table: DataFusion table to add to the session context. 
""" - self.ctx.register_table(name, table) + self.ctx.register_table(name, table.table) def deregister_table(self, name: str) -> None: """Remove a table from the session.""" @@ -767,7 +767,7 @@ def register_parquet( file_extension: str = ".parquet", skip_metadata: bool = True, schema: pyarrow.Schema | None = None, - file_sort_order: list[list[Expr]] | None = None, + file_sort_order: list[list[SortExpr]] | None = None, ) -> None: """Register a Parquet file as a table. @@ -798,7 +798,9 @@ def register_parquet( file_extension, skip_metadata, schema, - file_sort_order, + [sort_list_to_raw_sort_list(exprs) for exprs in file_sort_order] + if file_sort_order is not None + else None, ) def register_csv( @@ -934,7 +936,7 @@ def register_udwf(self, udwf: WindowUDF) -> None: def catalog(self, name: str = "datafusion") -> Catalog: """Retrieve a catalog by name.""" - return self.ctx.catalog(name) + return Catalog(self.ctx.catalog(name)) @deprecated( "Use the catalog provider interface ``SessionContext.Catalog`` to " @@ -1054,7 +1056,7 @@ def read_parquet( file_extension: str = ".parquet", skip_metadata: bool = True, schema: pyarrow.Schema | None = None, - file_sort_order: list[list[Expr]] | None = None, + file_sort_order: list[list[Expr | SortExpr]] | None = None, ) -> DataFrame: """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`. @@ -1078,6 +1080,11 @@ def read_parquet( """ if table_partition_cols is None: table_partition_cols = [] + file_sort_order = ( + [sort_list_to_raw_sort_list(f) for f in file_sort_order] + if file_sort_order is not None + else None + ) return DataFrame( self.ctx.read_parquet( str(path), @@ -1121,7 +1128,7 @@ def read_table(self, table: Table) -> DataFrame: :py:class:`~datafusion.catalog.ListingTable`, create a :py:class:`~datafusion.dataframe.DataFrame`. 
""" - return DataFrame(self.ctx.read_table(table)) + return DataFrame(self.ctx.read_table(table.table)) def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream: """Execute the ``plan`` and return the results.""" diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 85a179ec9..de5d8376e 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -52,6 +52,7 @@ from enum import Enum from datafusion._internal import DataFrame as DataFrameInternal +from datafusion._internal import expr as expr_internal from datafusion.expr import Expr, SortExpr, sort_or_default @@ -277,7 +278,7 @@ def with_columns( def _simplify_expression( *exprs: Expr | Iterable[Expr], **named_exprs: Expr - ) -> list[Expr]: + ) -> list[expr_internal.Expr]: expr_list = [] for expr in exprs: if isinstance(expr, Expr): diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index e3d7158eb..3639abec6 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -176,7 +176,7 @@ def sort_or_default(e: Expr | SortExpr) -> expr_internal.SortExpr: """Helper function to return a default Sort if an Expr is provided.""" if isinstance(e, SortExpr): return e.raw_sort - return SortExpr(e.expr, True, True).raw_sort + return SortExpr(e, True, True).raw_sort def sort_list_to_raw_sort_list( @@ -231,7 +231,7 @@ def variant_name(self) -> str: def __richcmp__(self, other: Expr, op: int) -> Expr: """Comparison operator.""" - return Expr(self.expr.__richcmp__(other, op)) + return Expr(self.expr.__richcmp__(other.expr, op)) def __repr__(self) -> str: """Generate a string representation of this expression.""" @@ -417,7 +417,7 @@ def sort(self, ascending: bool = True, nulls_first: bool = True) -> SortExpr: ascending: If true, sort in ascending order. nulls_first: Return null values first. 
""" - return SortExpr(self.expr, ascending=ascending, nulls_first=nulls_first) + return SortExpr(self, ascending=ascending, nulls_first=nulls_first) def is_null(self) -> Expr: """Returns ``True`` if this expression is null.""" @@ -789,7 +789,7 @@ class SortExpr: def __init__(self, expr: Expr, ascending: bool, nulls_first: bool) -> None: """This constructor should not be called by the end user.""" - self.raw_sort = expr_internal.SortExpr(expr, ascending, nulls_first) + self.raw_sort = expr_internal.SortExpr(expr.expr, ascending, nulls_first) def expr(self) -> Expr: """Return the raw expr backing the SortExpr.""" diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 5c260aade..b449c4868 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -366,7 +366,7 @@ def concat_ws(separator: str, *args: Expr) -> Expr: def order_by(expr: Expr, ascending: bool = True, nulls_first: bool = True) -> SortExpr: """Creates a new sort expression.""" - return SortExpr(expr.expr, ascending=ascending, nulls_first=nulls_first) + return SortExpr(expr, ascending=ascending, nulls_first=nulls_first) def alias(expr: Expr, name: str) -> Expr: @@ -942,6 +942,7 @@ def to_timestamp_millis(arg: Expr, *formatters: Expr) -> Expr: See :py:func:`to_timestamp` for a description on how to use formatters. """ + formatters = [f.expr for f in formatters] return Expr(f.to_timestamp_millis(arg.expr, *formatters)) @@ -950,6 +951,7 @@ def to_timestamp_micros(arg: Expr, *formatters: Expr) -> Expr: See :py:func:`to_timestamp` for a description on how to use formatters. """ + formatters = [f.expr for f in formatters] return Expr(f.to_timestamp_micros(arg.expr, *formatters)) @@ -958,6 +960,7 @@ def to_timestamp_nanos(arg: Expr, *formatters: Expr) -> Expr: See :py:func:`to_timestamp` for a description on how to use formatters. 
""" + formatters = [f.expr for f in formatters] return Expr(f.to_timestamp_nanos(arg.expr, *formatters)) @@ -966,6 +969,7 @@ def to_timestamp_seconds(arg: Expr, *formatters: Expr) -> Expr: See :py:func:`to_timestamp` for a description on how to use formatters. """ + formatters = [f.expr for f in formatters] return Expr(f.to_timestamp_seconds(arg.expr, *formatters)) @@ -1078,9 +1082,9 @@ def range(start: Expr, stop: Expr, step: Expr) -> Expr: return Expr(f.range(start.expr, stop.expr, step.expr)) -def uuid(arg: Expr) -> Expr: +def uuid() -> Expr: """Returns uuid v4 as a string value.""" - return Expr(f.uuid(arg.expr)) + return Expr(f.uuid()) def struct(*args: Expr) -> Expr: diff --git a/python/datafusion/input/location.py b/python/datafusion/input/location.py index a8252b53c..517cd1578 100644 --- a/python/datafusion/input/location.py +++ b/python/datafusion/input/location.py @@ -37,12 +37,12 @@ def is_correct_input(self, input_item: Any, table_name: str, **kwargs): def build_table( self, - input_file: str, + input_item: str, table_name: str, **kwargs, ) -> SqlTable: """Create a table from the input source.""" - _, extension = os.path.splitext(input_file) + _, extension = os.path.splitext(input_item) format = extension.lstrip(".").lower() num_rows = 0 # Total number of rows in the file. Used for statistics columns = [] @@ -50,7 +50,7 @@ def build_table( import pyarrow.parquet as pq # Read the Parquet metadata - metadata = pq.read_metadata(input_file) + metadata = pq.read_metadata(input_item) num_rows = metadata.num_rows # Iterate through the schema and build the SqlTable for col in metadata.schema: @@ -69,7 +69,7 @@ def build_table( # to get that information. However, this should only be occurring # at table creation time and therefore shouldn't # slow down query performance. 
- with open(input_file, "r") as file: + with open(input_item, "r") as file: reader = csv.reader(file) header_row = next(reader) print(header_row) @@ -84,6 +84,6 @@ def build_table( ) # Input could possibly be multiple files. Create a list if so - input_files = glob.glob(input_file) + input_files = glob.glob(input_item) return SqlTable(table_name, columns, num_rows, input_files) diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index c97f453d0..0bba3d723 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -85,7 +85,7 @@ class ScalarUDF: def __init__( self, - name: Optional[str], + name: str, func: Callable[..., _R], input_types: pyarrow.DataType | list[pyarrow.DataType], return_type: _R, @@ -182,7 +182,7 @@ class AggregateUDF: def __init__( self, - name: Optional[str], + name: str, accumulator: Callable[[], Accumulator], input_types: list[pyarrow.DataType], return_type: pyarrow.DataType, @@ -277,6 +277,7 @@ def sum_bias_10() -> Summarize: ) if name is None: name = accum.__call__().__class__.__qualname__.lower() + assert name is not None if isinstance(input_types, pyarrow.DataType): input_types = [input_types] return AggregateUDF( @@ -462,7 +463,7 @@ class WindowUDF: def __init__( self, - name: Optional[str], + name: str, func: Callable[[], WindowEvaluator], input_types: list[pyarrow.DataType], return_type: pyarrow.DataType, diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index b1a739b49..fca05bb8f 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -871,7 +871,22 @@ def test_temporal_functions(df): f.to_timestamp_millis(literal("2023-09-07 05:06:14.523952")), f.to_timestamp_micros(literal("2023-09-07 05:06:14.523952")), f.extract(literal("day"), column("d")), + f.to_timestamp( + literal("2023-09-07 05:06:14.523952000"), literal("%Y-%m-%d %H:%M:%S.%f") + ), + f.to_timestamp_seconds( + literal("2023-09-07 05:06:14.523952000"), literal("%Y-%m-%d %H:%M:%S.%f") + ), + 
f.to_timestamp_millis( + literal("2023-09-07 05:06:14.523952000"), literal("%Y-%m-%d %H:%M:%S.%f") + ), + f.to_timestamp_micros( + literal("2023-09-07 05:06:14.523952000"), literal("%Y-%m-%d %H:%M:%S.%f") + ), f.to_timestamp_nanos(literal("2023-09-07 05:06:14.523952")), + f.to_timestamp_nanos( + literal("2023-09-07 05:06:14.523952000"), literal("%Y-%m-%d %H:%M:%S.%f") + ), ) result = df.collect() assert len(result) == 1 @@ -913,6 +928,21 @@ def test_temporal_functions(df): assert result.column(11) == pa.array( [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") ) + assert result.column(12) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14)] * 3, type=pa.timestamp("s") + ) + assert result.column(13) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14, 523000)] * 3, type=pa.timestamp("ms") + ) + assert result.column(14) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us") + ) + assert result.column(15) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + ) + assert result.column(16) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + ) def test_arrow_cast(df): From acd70409f73f299a144e7ff4115c6e6035c3ffb5 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Sat, 8 Mar 2025 16:37:10 +0100 Subject: [PATCH 104/248] feat: reads using global ctx (#982) * feat: reads using global ctx * Add text to io methods to describe the context they are using --------- Co-authored-by: Tim Saucer --- python/datafusion/__init__.py | 5 + python/datafusion/io.py | 199 ++++++++++++++++++++++++++ python/tests/test_io.py | 95 ++++++++++++ python/tests/test_wrapper_coverage.py | 2 + src/context.rs | 12 +- src/utils.rs | 8 ++ 6 files changed, 319 insertions(+), 2 deletions(-) create mode 100644 python/datafusion/io.py create mode 100644 python/tests/test_io.py diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 
85aefcce7..f11ce54a6 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -45,6 +45,7 @@ Expr, WindowFrame, ) +from .io import read_avro, read_csv, read_json, read_parquet from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream from .udf import Accumulator, AggregateUDF, ScalarUDF, WindowUDF @@ -81,6 +82,10 @@ "functions", "object_store", "substrait", + "read_parquet", + "read_avro", + "read_csv", + "read_json", ] diff --git a/python/datafusion/io.py b/python/datafusion/io.py new file mode 100644 index 000000000..7f3b77efa --- /dev/null +++ b/python/datafusion/io.py @@ -0,0 +1,199 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""IO read functions using global context.""" + +import pathlib + +import pyarrow + +from datafusion.dataframe import DataFrame +from datafusion.expr import Expr + +from ._internal import SessionContext as SessionContextInternal + + +def read_parquet( + path: str | pathlib.Path, + table_partition_cols: list[tuple[str, str]] | None = None, + parquet_pruning: bool = True, + file_extension: str = ".parquet", + skip_metadata: bool = True, + schema: pyarrow.Schema | None = None, + file_sort_order: list[list[Expr]] | None = None, +) -> DataFrame: + """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`. + + This function will use the global context. Any functions or tables registered + with another context may not be accessible when used with a DataFrame created + using this function. + + Args: + path: Path to the Parquet file. + table_partition_cols: Partition columns. + parquet_pruning: Whether the parquet reader should use the predicate + to prune row groups. + file_extension: File extension; only files with this extension are + selected for data input. + skip_metadata: Whether the parquet reader should skip any metadata + that may be in the file schema. This can help avoid schema + conflicts due to metadata. + schema: An optional schema representing the parquet files. If None, + the parquet reader will try to infer it based on data in the + file. + file_sort_order: Sort order for the file. 
+ + Returns: + DataFrame representation of the read Parquet files + """ + if table_partition_cols is None: + table_partition_cols = [] + return DataFrame( + SessionContextInternal._global_ctx().read_parquet( + str(path), + table_partition_cols, + parquet_pruning, + file_extension, + skip_metadata, + schema, + file_sort_order, + ) + ) + + +def read_json( + path: str | pathlib.Path, + schema: pyarrow.Schema | None = None, + schema_infer_max_records: int = 1000, + file_extension: str = ".json", + table_partition_cols: list[tuple[str, str]] | None = None, + file_compression_type: str | None = None, +) -> DataFrame: + """Read a line-delimited JSON data source. + + This function will use the global context. Any functions or tables registered + with another context may not be accessible when used with a DataFrame created + using this function. + + Args: + path: Path to the JSON file. + schema: The data source schema. + schema_infer_max_records: Maximum number of rows to read from JSON + files for schema inference if needed. + file_extension: File extension; only files with this extension are + selected for data input. + table_partition_cols: Partition columns. + file_compression_type: File compression type. + + Returns: + DataFrame representation of the read JSON files. + """ + if table_partition_cols is None: + table_partition_cols = [] + return DataFrame( + SessionContextInternal._global_ctx().read_json( + str(path), + schema, + schema_infer_max_records, + file_extension, + table_partition_cols, + file_compression_type, + ) + ) + + +def read_csv( + path: str | pathlib.Path | list[str] | list[pathlib.Path], + schema: pyarrow.Schema | None = None, + has_header: bool = True, + delimiter: str = ",", + schema_infer_max_records: int = 1000, + file_extension: str = ".csv", + table_partition_cols: list[tuple[str, str]] | None = None, + file_compression_type: str | None = None, +) -> DataFrame: + """Read a CSV data source. + + This function will use the global context. 
Any functions or tables registered + with another context may not be accessible when used with a DataFrame created + using this function. + + Args: + path: Path to the CSV file + schema: An optional schema representing the CSV files. If None, the + CSV reader will try to infer it based on data in file. + has_header: Whether the CSV file has a header. If schema inference + is run on a file with no headers, default column names are + created. + delimiter: An optional column delimiter. + schema_infer_max_records: Maximum number of rows to read from CSV + files for schema inference if needed. + file_extension: File extension; only files with this extension are + selected for data input. + table_partition_cols: Partition columns. + file_compression_type: File compression type. + + Returns: + DataFrame representation of the read CSV files + """ + if table_partition_cols is None: + table_partition_cols = [] + + path = [str(p) for p in path] if isinstance(path, list) else str(path) + + return DataFrame( + SessionContextInternal._global_ctx().read_csv( + path, + schema, + has_header, + delimiter, + schema_infer_max_records, + file_extension, + table_partition_cols, + file_compression_type, + ) + ) + + +def read_avro( + path: str | pathlib.Path, + schema: pyarrow.Schema | None = None, + file_partition_cols: list[tuple[str, str]] | None = None, + file_extension: str = ".avro", +) -> DataFrame: + """Create a :py:class:`DataFrame` for reading Avro data source. + + This function will use the global context. Any functions or tables registered + with another context may not be accessible when used with a DataFrame created + using this function. + + Args: + path: Path to the Avro file. + schema: The data source schema. + file_partition_cols: Partition columns. + file_extension: File extension to select. 
+ + Returns: + DataFrame representation of the read Avro file + """ + if file_partition_cols is None: + file_partition_cols = [] + return DataFrame( + SessionContextInternal._global_ctx().read_avro( + str(path), schema, file_partition_cols, file_extension + ) + ) diff --git a/python/tests/test_io.py b/python/tests/test_io.py new file mode 100644 index 000000000..21ad188ee --- /dev/null +++ b/python/tests/test_io.py @@ -0,0 +1,95 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import os +import pathlib + +import pyarrow as pa +from datafusion import column +from datafusion.io import read_avro, read_csv, read_json, read_parquet + + +def test_read_json_global_ctx(ctx): + path = os.path.dirname(os.path.abspath(__file__)) + + # Default + test_data_path = os.path.join(path, "data_test_context", "data.json") + df = read_json(test_data_path) + result = df.collect() + + assert result[0].column(0) == pa.array(["a", "b", "c"]) + assert result[0].column(1) == pa.array([1, 2, 3]) + + # Schema + schema = pa.schema( + [ + pa.field("A", pa.string(), nullable=True), + ] + ) + df = read_json(test_data_path, schema=schema) + result = df.collect() + + assert result[0].column(0) == pa.array(["a", "b", "c"]) + assert result[0].schema == schema + + # File extension + test_data_path = os.path.join(path, "data_test_context", "data.json") + df = read_json(test_data_path, file_extension=".json") + result = df.collect() + + assert result[0].column(0) == pa.array(["a", "b", "c"]) + assert result[0].column(1) == pa.array([1, 2, 3]) + + +def test_read_parquet_global(): + parquet_df = read_parquet(path="parquet/data/alltypes_plain.parquet") + parquet_df.show() + assert parquet_df is not None + + path = pathlib.Path.cwd() / "parquet/data/alltypes_plain.parquet" + parquet_df = read_parquet(path=path) + assert parquet_df is not None + + +def test_read_csv(): + csv_df = read_csv(path="testing/data/csv/aggregate_test_100.csv") + csv_df.select(column("c1")).show() + + +def test_read_csv_list(): + csv_df = read_csv(path=["testing/data/csv/aggregate_test_100.csv"]) + expected = csv_df.count() * 2 + + double_csv_df = read_csv( + path=[ + "testing/data/csv/aggregate_test_100.csv", + "testing/data/csv/aggregate_test_100.csv", + ] + ) + actual = double_csv_df.count() + + double_csv_df.select(column("c1")).show() + assert actual == expected + + +def test_read_avro(): + avro_df = read_avro(path="testing/data/avro/alltypes_plain.avro") + avro_df.show() + assert avro_df is not None 
+ + path = pathlib.Path.cwd() / "testing/data/avro/alltypes_plain.avro" + avro_df = read_avro(path=path) + assert avro_df is not None diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py index 86f2d57f2..ac064ba95 100644 --- a/python/tests/test_wrapper_coverage.py +++ b/python/tests/test_wrapper_coverage.py @@ -34,6 +34,8 @@ def missing_exports(internal_obj, wrapped_obj) -> None: return for attr in dir(internal_obj): + if attr in ["_global_ctx"]: + continue assert attr in dir(wrapped_obj) internal_attr = getattr(internal_obj, attr) diff --git a/src/context.rs b/src/context.rs index 0f962638e..9ba87eb8a 100644 --- a/src/context.rs +++ b/src/context.rs @@ -44,7 +44,7 @@ use crate::store::StorageContexts; use crate::udaf::PyAggregateUDF; use crate::udf::PyScalarUDF; use crate::udwf::PyWindowUDF; -use crate::utils::{get_tokio_runtime, validate_pycapsule, wait_for_future}; +use crate::utils::{get_global_ctx, get_tokio_runtime, validate_pycapsule, wait_for_future}; use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::arrow::pyarrow::PyArrowType; use datafusion::arrow::record_batch::RecordBatch; @@ -69,7 +69,7 @@ use datafusion::prelude::{ AvroReadOptions, CsvReadOptions, DataFrame, NdJsonReadOptions, ParquetReadOptions, }; use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider}; -use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple}; +use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType}; use tokio::task::JoinHandle; /// Configuration options for a SessionContext @@ -306,6 +306,14 @@ impl PySessionContext { }) } + #[classmethod] + #[pyo3(signature = ())] + fn _global_ctx(_cls: &Bound<'_, PyType>) -> PyResult { + Ok(Self { + ctx: get_global_ctx().clone(), + }) + } + /// Register an object store with the given name #[pyo3(signature = (scheme, store, host=None))] pub fn register_object_store( diff --git a/src/utils.rs b/src/utils.rs index ed224b364..999aad755 100644 --- 
a/src/utils.rs +++ b/src/utils.rs @@ -17,6 +17,7 @@ use crate::errors::{PyDataFusionError, PyDataFusionResult}; use crate::TokioRuntime; +use datafusion::execution::context::SessionContext; use datafusion::logical_expr::Volatility; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; @@ -37,6 +38,13 @@ pub(crate) fn get_tokio_runtime() -> &'static TokioRuntime { RUNTIME.get_or_init(|| TokioRuntime(tokio::runtime::Runtime::new().unwrap())) } +/// Utility to get the Global DataFusion CTX +#[inline] +pub(crate) fn get_global_ctx() -> &'static SessionContext { + static CTX: OnceLock = OnceLock::new(); + CTX.get_or_init(|| SessionContext::new()) +} + /// Utility to collect rust futures with GIL released pub fn wait_for_future(py: Python, f: F) -> F::Output where From 973d7ec4a8196a78bc4fb32db4f24e523997ba4c Mon Sep 17 00:00:00 2001 From: Crystal Zhou <45134936+CrystalZhou0529@users.noreply.github.com> Date: Sat, 8 Mar 2025 16:23:54 -0500 Subject: [PATCH 105/248] feat: Implementation of udf and udaf decorator (#1040) * Implementation of udf and udaf decorator * Rename decorators back to udf and udaf, update documentations * Minor typo fixes * Fixing linting errors * ruff formatting --------- Co-authored-by: Tim Saucer --- python/datafusion/udf.py | 257 +++++++++++++++++++++++++++----------- python/tests/test_udaf.py | 42 +++++++ python/tests/test_udf.py | 42 ++++++- 3 files changed, 265 insertions(+), 76 deletions(-) diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index 0bba3d723..af7bcf2ed 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -19,6 +19,7 @@ from __future__ import annotations +import functools from abc import ABCMeta, abstractmethod from enum import Enum from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar @@ -110,43 +111,102 @@ def __call__(self, *args: Expr) -> Expr: args_raw = [arg.expr for arg in args] return Expr(self._udf.__call__(*args_raw)) - @staticmethod - def udf( - func: Callable[...,
_R], - input_types: list[pyarrow.DataType], - return_type: _R, - volatility: Volatility | str, - name: Optional[str] = None, - ) -> ScalarUDF: - """Create a new User-Defined Function. + class udf: + """Create a new User-Defined Function (UDF). + + This class can be used both as a **function** and as a **decorator**. + + Usage: + - **As a function**: Call `udf(func, input_types, return_type, volatility, + name)`. + - **As a decorator**: Use `@udf(input_types, return_type, volatility, + name)`. In this case, do **not** pass `func` explicitly. Args: - func: A callable python function. - input_types: The data types of the arguments to ``func``. This list - must be of the same length as the number of arguments. - return_type: The data type of the return value from the python - function. - volatility: See ``Volatility`` for allowed values. - name: A descriptive name for the function. + func (Callable, optional): **Only needed when calling as a function.** + Skip this argument when using `udf` as a decorator. + input_types (list[pyarrow.DataType]): The data types of the arguments + to `func`. This list must be of the same length as the number of + arguments. + return_type (_R): The data type of the return value from the function. + volatility (Volatility | str): See `Volatility` for allowed values. + name (Optional[str]): A descriptive name for the function. Returns: - A user-defined aggregate function, which can be used in either data - aggregation or window function calls. + A user-defined function that can be used in SQL expressions, + data aggregation, or window function calls. 
+ + Example: + **Using `udf` as a function:** + ``` + def double_func(x): + return x * 2 + double_udf = udf(double_func, [pyarrow.int32()], pyarrow.int32(), + "volatile", "double_it") + ``` + + **Using `udf` as a decorator:** + ``` + @udf([pyarrow.int32()], pyarrow.int32(), "volatile", "double_it") + def double_udf(x): + return x * 2 + ``` """ - if not callable(func): - raise TypeError("`func` argument must be callable") - if name is None: - if hasattr(func, "__qualname__"): - name = func.__qualname__.lower() + + def __new__(cls, *args, **kwargs): + """Create a new UDF. + + Trigger UDF function or decorator depending on if the first args is callable + """ + if args and callable(args[0]): + # Case 1: Used as a function, require the first parameter to be callable + return cls._function(*args, **kwargs) else: - name = func.__class__.__name__.lower() - return ScalarUDF( - name=name, - func=func, - input_types=input_types, - return_type=return_type, - volatility=volatility, - ) + # Case 2: Used as a decorator with parameters + return cls._decorator(*args, **kwargs) + + @staticmethod + def _function( + func: Callable[..., _R], + input_types: list[pyarrow.DataType], + return_type: _R, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> ScalarUDF: + if not callable(func): + raise TypeError("`func` argument must be callable") + if name is None: + if hasattr(func, "__qualname__"): + name = func.__qualname__.lower() + else: + name = func.__class__.__name__.lower() + return ScalarUDF( + name=name, + func=func, + input_types=input_types, + return_type=return_type, + volatility=volatility, + ) + + @staticmethod + def _decorator( + input_types: list[pyarrow.DataType], + return_type: _R, + volatility: Volatility | str, + name: Optional[str] = None, + ): + def decorator(func): + udf_caller = ScalarUDF.udf( + func, input_types, return_type, volatility, name + ) + + @functools.wraps(func) + def wrapper(*args, **kwargs): + return udf_caller(*args, **kwargs) + + return 
wrapper + + return decorator class Accumulator(metaclass=ABCMeta): @@ -212,25 +272,27 @@ def __call__(self, *args: Expr) -> Expr: args_raw = [arg.expr for arg in args] return Expr(self._udaf.__call__(*args_raw)) - @staticmethod - def udaf( - accum: Callable[[], Accumulator], - input_types: pyarrow.DataType | list[pyarrow.DataType], - return_type: pyarrow.DataType, - state_type: list[pyarrow.DataType], - volatility: Volatility | str, - name: Optional[str] = None, - ) -> AggregateUDF: - """Create a new User-Defined Aggregate Function. + class udaf: + """Create a new User-Defined Aggregate Function (UDAF). - If your :py:class:`Accumulator` can be instantiated with no arguments, you - can simply pass it's type as ``accum``. If you need to pass additional arguments - to it's constructor, you can define a lambda or a factory method. During runtime - the :py:class:`Accumulator` will be constructed for every instance in - which this UDAF is used. The following examples are all valid. + This class allows you to define an **aggregate function** that can be used in + data aggregation or window function calls. - .. code-block:: python + Usage: + - **As a function**: Call `udaf(accum, input_types, return_type, state_type, + volatility, name)`. + - **As a decorator**: Use `@udaf(input_types, return_type, state_type, + volatility, name)`. + When using `udaf` as a decorator, **do not pass `accum` explicitly**. + **Function example:** + + If your :py:class:`Accumulator` can be instantiated with no arguments, you + can simply pass its type as `accum`. If you need to pass additional + arguments to its constructor, you can define a lambda or a factory method. + During runtime the :py:class:`Accumulator` will be constructed for every + instance in which this UDAF is used. The following examples are all valid.
+ ``` import pyarrow as pa import pyarrow.compute as pc @@ -253,12 +315,24 @@ def evaluate(self) -> pa.Scalar: def sum_bias_10() -> Summarize: return Summarize(10.0) - udaf1 = udaf(Summarize, pa.float64(), pa.float64(), [pa.float64()], "immutable") - udaf2 = udaf(sum_bias_10, pa.float64(), pa.float64(), [pa.float64()], "immutable") - udaf3 = udaf(lambda: Summarize(20.0), pa.float64(), pa.float64(), [pa.float64()], "immutable") + udaf1 = udaf(Summarize, pa.float64(), pa.float64(), [pa.float64()], + "immutable") + udaf2 = udaf(sum_bias_10, pa.float64(), pa.float64(), [pa.float64()], + "immutable") + udaf3 = udaf(lambda: Summarize(20.0), pa.float64(), pa.float64(), + [pa.float64()], "immutable") + ``` + + **Decorator example:** + ``` + @udaf(pa.float64(), pa.float64(), [pa.float64()], "immutable") + def udf4() -> Summarize: + return Summarize(10.0) + ``` Args: - accum: The accumulator python function. + accum: The accumulator python function. **Only needed when calling as a + function. Skip this argument when using `udaf` as a decorator.** input_types: The data types of the arguments to ``accum``. return_type: The data type of the return value. state_type: The data types of the intermediate accumulation. @@ -268,26 +342,69 @@ def sum_bias_10() -> Summarize: Returns: A user-defined aggregate function, which can be used in either data aggregation or window function calls. - """ # noqa W505 - if not callable(accum): - raise TypeError("`func` must be callable.") - if not isinstance(accum.__call__(), Accumulator): - raise TypeError( - "Accumulator must implement the abstract base class Accumulator" + """ + + def __new__(cls, *args, **kwargs): + """Create a new UDAF. 
+ + Trigger UDAF function or decorator depending on if the first args is + callable + """ + if args and callable(args[0]): + # Case 1: Used as a function, require the first parameter to be callable + return cls._function(*args, **kwargs) + else: + # Case 2: Used as a decorator with parameters + return cls._decorator(*args, **kwargs) + + @staticmethod + def _function( + accum: Callable[[], Accumulator], + input_types: pyarrow.DataType | list[pyarrow.DataType], + return_type: pyarrow.DataType, + state_type: list[pyarrow.DataType], + volatility: Volatility | str, + name: Optional[str] = None, + ) -> AggregateUDF: + if not callable(accum): + raise TypeError("`func` must be callable.") + if not isinstance(accum.__call__(), Accumulator): + raise TypeError( + "Accumulator must implement the abstract base class Accumulator" + ) + if name is None: + name = accum.__call__().__class__.__qualname__.lower() + if isinstance(input_types, pyarrow.DataType): + input_types = [input_types] + return AggregateUDF( + name=name, + accumulator=accum, + input_types=input_types, + return_type=return_type, + state_type=state_type, + volatility=volatility, ) - if name is None: - name = accum.__call__().__class__.__qualname__.lower() - assert name is not None - if isinstance(input_types, pyarrow.DataType): - input_types = [input_types] - return AggregateUDF( - name=name, - accumulator=accum, - input_types=input_types, - return_type=return_type, - state_type=state_type, - volatility=volatility, - ) + + @staticmethod + def _decorator( + input_types: pyarrow.DataType | list[pyarrow.DataType], + return_type: pyarrow.DataType, + state_type: list[pyarrow.DataType], + volatility: Volatility | str, + name: Optional[str] = None, + ): + def decorator(accum: Callable[[], Accumulator]): + udaf_caller = AggregateUDF.udaf( + accum, input_types, return_type, state_type, volatility, name + ) + + @functools.wraps(accum) + def wrapper(*args, **kwargs): + return udaf_caller(*args, **kwargs) + + return wrapper + 
+ return decorator class WindowEvaluator(metaclass=ABCMeta): diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py index 0005a3da8..e69c77d3c 100644 --- a/python/tests/test_udaf.py +++ b/python/tests/test_udaf.py @@ -117,6 +117,26 @@ def test_udaf_aggregate(df): assert result.column(0) == pa.array([1.0 + 2.0 + 3.0]) +def test_udaf_decorator_aggregate(df): + @udaf(pa.float64(), pa.float64(), [pa.float64()], "immutable") + def summarize(): + return Summarize() + + df1 = df.aggregate([], [summarize(column("a"))]) + + # execute and collect the first (and only) batch + result = df1.collect()[0] + + assert result.column(0) == pa.array([1.0 + 2.0 + 3.0]) + + df2 = df.aggregate([], [summarize(column("a"))]) + + # Run a second time to ensure the state is properly reset + result = df2.collect()[0] + + assert result.column(0) == pa.array([1.0 + 2.0 + 3.0]) + + def test_udaf_aggregate_with_arguments(df): bias = 10.0 @@ -143,6 +163,28 @@ def test_udaf_aggregate_with_arguments(df): assert result.column(0) == pa.array([bias + 1.0 + 2.0 + 3.0]) +def test_udaf_decorator_aggregate_with_arguments(df): + bias = 10.0 + + @udaf(pa.float64(), pa.float64(), [pa.float64()], "immutable") + def summarize(): + return Summarize(bias) + + df1 = df.aggregate([], [summarize(column("a"))]) + + # execute and collect the first (and only) batch + result = df1.collect()[0] + + assert result.column(0) == pa.array([bias + 1.0 + 2.0 + 3.0]) + + df2 = df.aggregate([], [summarize(column("a"))]) + + # Run a second time to ensure the state is properly reset + result = df2.collect()[0] + + assert result.column(0) == pa.array([bias + 1.0 + 2.0 + 3.0]) + + def test_group_by(df): summarize = udaf( Summarize, diff --git a/python/tests/test_udf.py b/python/tests/test_udf.py index 3a5dce6d6..a6c047552 100644 --- a/python/tests/test_udf.py +++ b/python/tests/test_udf.py @@ -24,7 +24,7 @@ def df(ctx): # create a RecordBatch and a new DataFrame from it batch = pa.RecordBatch.from_arrays( - [pa.array([1, 
2, 3]), pa.array([4, 4, 6])], + [pa.array([1, 2, 3]), pa.array([4, 4, None])], names=["a", "b"], ) return ctx.create_dataframe([[batch]], name="test_table") @@ -39,10 +39,20 @@ def test_udf(df): volatility="immutable", ) - df = df.select(is_null(column("a"))) + df = df.select(is_null(column("b"))) result = df.collect()[0].column(0) - assert result == pa.array([False, False, False]) + assert result == pa.array([False, False, True]) + + +def test_udf_decorator(df): + @udf([pa.int64()], pa.bool_(), "immutable") + def is_null(x: pa.Array) -> pa.Array: + return x.is_null() + + df = df.select(is_null(column("b"))) + result = df.collect()[0].column(0) + assert result == pa.array([False, False, True]) def test_register_udf(ctx, df) -> None: @@ -56,10 +66,10 @@ def test_register_udf(ctx, df) -> None: ctx.register_udf(is_null) - df_result = ctx.sql("select is_null(a) from test_table") + df_result = ctx.sql("select is_null(b) from test_table") result = df_result.collect()[0].column(0) - assert result == pa.array([False, False, False]) + assert result == pa.array([False, False, True]) class OverThresholdUDF: @@ -70,7 +80,7 @@ def __call__(self, values: pa.Array) -> pa.Array: return pa.array(v.as_py() >= self.threshold for v in values) -def test_udf_with_parameters(df) -> None: +def test_udf_with_parameters_function(df) -> None: udf_no_param = udf( OverThresholdUDF(), pa.int64(), @@ -94,3 +104,23 @@ def test_udf_with_parameters(df) -> None: result = df2.collect()[0].column(0) assert result == pa.array([False, True, True]) + + +def test_udf_with_parameters_decorator(df) -> None: + @udf([pa.int64()], pa.bool_(), "immutable") + def udf_no_param(values: pa.Array) -> pa.Array: + return OverThresholdUDF()(values) + + df1 = df.select(udf_no_param(column("a"))) + result = df1.collect()[0].column(0) + + assert result == pa.array([True, True, True]) + + @udf([pa.int64()], pa.bool_(), "immutable") + def udf_with_param(values: pa.Array) -> pa.Array: + return OverThresholdUDF(2)(values) + + 
df2 = df.select(udf_with_param(column("a"))) + result = df2.collect()[0].column(0) + + assert result == pa.array([False, True, True]) From d72f5605b3d523585d04857505793920f96242ba Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 10 Mar 2025 06:56:12 -0400 Subject: [PATCH 106/248] Enable FA ruff lint (#1052) --- examples/python-udwf.py | 2 ++ pyproject.toml | 2 +- python/datafusion/io.py | 2 ++ python/tests/test_udaf.py | 2 ++ python/tests/test_udwf.py | 2 ++ 5 files changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/python-udwf.py b/examples/python-udwf.py index 32f8fadaa..7d39dc1b8 100644 --- a/examples/python-udwf.py +++ b/examples/python-udwf.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +from __future__ import annotations + import datafusion import pyarrow as pa from datafusion import col, lit, udwf diff --git a/pyproject.toml b/pyproject.toml index f416e02a5..d16a18aa6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ features = ["substrait"] # Enable docstring linting using the google style guide [tool.ruff.lint] -select = ["E4", "E7", "E9", "F", "D", "W", "I"] +select = ["E4", "E7", "E9", "F", "FA", "D", "W", "I"] [tool.ruff.lint.pydocstyle] convention = "google" diff --git a/python/datafusion/io.py b/python/datafusion/io.py index 7f3b77efa..3b6264948 100644 --- a/python/datafusion/io.py +++ b/python/datafusion/io.py @@ -17,6 +17,8 @@ """IO read functions using global context.""" +from __future__ import annotations + import pathlib import pyarrow diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py index e69c77d3c..97cf81f3c 100644 --- a/python/tests/test_udaf.py +++ b/python/tests/test_udaf.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. 
+from __future__ import annotations + from typing import List import pyarrow as pa diff --git a/python/tests/test_udwf.py b/python/tests/test_udwf.py index 0ffa04179..2fea34aa3 100644 --- a/python/tests/test_udwf.py +++ b/python/tests/test_udwf.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +from __future__ import annotations + import pyarrow as pa import pytest from datafusion import SessionContext, column, lit, udwf From 0002372ccdb780e011631c797ec9613174cf0a94 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 10 Mar 2025 14:22:42 -0400 Subject: [PATCH 107/248] Enable take comments to assign issues to users (#1058) --- .github/workflows/take.yml | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 .github/workflows/take.yml diff --git a/.github/workflows/take.yml b/.github/workflows/take.yml new file mode 100644 index 000000000..86dc190ad --- /dev/null +++ b/.github/workflows/take.yml @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +name: Assign the issue via a `take` comment +on: + issue_comment: + types: created + +permissions: + issues: write + +jobs: + issue_assign: + runs-on: ubuntu-latest + if: (!github.event.issue.pull_request) && github.event.comment.body == 'take' + concurrency: + group: ${{ github.actor }}-issue-assign + steps: + - run: | + CODE=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -LI https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees/${{ github.event.comment.user.login }} -o /dev/null -w '%{http_code}\n' -s) + if [ "$CODE" -eq "204" ] + then + echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees + else + echo "Cannot assign issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" + fi \ No newline at end of file From 9d634de6df2f8b76bd303ab1f5972f01deb2210d Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 10 Mar 2025 14:24:40 -0400 Subject: [PATCH 108/248] Update python min version to 3.9 (#1043) * 3.8 -> 3.9 * upgrade pyo3 abi3-py38 -> abi3-py39 --- Cargo.toml | 2 +- .../source/contributor-guide/introduction.rst | 2 +- examples/ffi-table-provider/Cargo.lock | 75 +- examples/ffi-table-provider/Cargo.toml | 2 +- examples/ffi-table-provider/pyproject.toml | 2 +- pyproject.toml | 3 +- uv.lock | 707 ++---------------- 7 files changed, 121 insertions(+), 672 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5358b1836..50967a219 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 +35,7 @@ substrait = ["dep:datafusion-substrait"] [dependencies] tokio = { version = "1.42", features = ["macros", "rt", "rt-multi-thread", "sync"] } -pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py38"] } 
+pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] } pyo3-async-runtimes = { version = "0.23", features = ["tokio-runtime"]} arrow = { version = "54", features = ["pyarrow"] } datafusion = { version = "45.0.0", features = ["avro", "unicode_expressions"] } diff --git a/docs/source/contributor-guide/introduction.rst b/docs/source/contributor-guide/introduction.rst index 25f2c21a4..2fba64111 100644 --- a/docs/source/contributor-guide/introduction.rst +++ b/docs/source/contributor-guide/introduction.rst @@ -118,7 +118,7 @@ be ignored by ``git``. .. code-block:: implementation=CPython - version=3.8 + version=3.9 shared=true abi3=true lib_name=python3.12 diff --git a/examples/ffi-table-provider/Cargo.lock b/examples/ffi-table-provider/Cargo.lock index 32af85180..8d0edd515 100644 --- a/examples/ffi-table-provider/Cargo.lock +++ b/examples/ffi-table-provider/Cargo.lock @@ -766,7 +766,8 @@ dependencies = [ [[package]] name = "datafusion" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eae420e7a5b0b7f1c39364cc76cbcd0f5fdc416b2514ae3847c2676bbd60702a" dependencies = [ "arrow", "arrow-array", @@ -816,7 +817,8 @@ dependencies = [ [[package]] name = "datafusion-catalog" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f27987bc22b810939e8dfecc55571e9d50355d6ea8ec1c47af8383a76a6d0e1" dependencies = [ "arrow", "async-trait", @@ -836,7 +838,8 @@ dependencies = [ [[package]] name = "datafusion-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e3f6d5b8c9408cc692f7c194b8aa0c0f9b253e065a8d960ad9cdc2a13e697602" dependencies = [ "ahash", "arrow", @@ -862,7 +865,8 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d4603c8e8a4baf77660ab7074cc66fc15cc8a18f2ce9dfadb755fc6ee294e48" dependencies = [ "log", "tokio", @@ -871,12 +875,14 @@ dependencies = [ [[package]] name = "datafusion-doc" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5bf4bc68623a5cf231eed601ed6eb41f46a37c4d15d11a0bff24cbc8396cd66" [[package]] name = "datafusion-execution" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b491c012cdf8e051053426013429a76f74ee3c2db68496c79c323ca1084d27" dependencies = [ "arrow", "dashmap", @@ -894,7 +900,8 @@ dependencies = [ [[package]] name = "datafusion-expr" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a181408d4fc5dc22f9252781a8f39f2d0e5d1b33ec9bde242844980a2689c1" dependencies = [ "arrow", "chrono", @@ -914,7 +921,8 @@ dependencies = [ [[package]] name = "datafusion-expr-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1129b48e8534d8c03c6543bcdccef0b55c8ac0c1272a15a56c67068b6eb1885" dependencies = [ "arrow", "datafusion-common", @@ -925,7 +933,8 
@@ dependencies = [ [[package]] name = "datafusion-ffi" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff47a79d442207c168c6e3e1d970c248589c148e4800e5b285ac1b2cb1a230f8" dependencies = [ "abi_stable", "arrow", @@ -945,7 +954,8 @@ dependencies = [ [[package]] name = "datafusion-functions" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6125874e4856dfb09b59886784fcb74cde5cfc5930b3a80a1a728ef7a010df6b" dependencies = [ "arrow", "arrow-buffer", @@ -974,7 +984,8 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3add7b1d3888e05e7c95f2b281af900ca69ebdcb21069ba679b33bde8b3b9d6" dependencies = [ "ahash", "arrow", @@ -996,7 +1007,8 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e18baa4cfc3d2f144f74148ed68a1f92337f5072b6dde204a0dbbdf3324989c" dependencies = [ "ahash", "arrow", @@ -1008,7 +1020,8 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ec5ee8cecb0dc370291279673097ddabec03a011f73f30d7f1096457127e03e" dependencies = [ "arrow", "arrow-array", @@ -1031,7 +1044,8 @@ 
dependencies = [ [[package]] name = "datafusion-functions-table" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c403ddd473bbb0952ba880008428b3c7febf0ed3ce1eec35a205db20efb2a36" dependencies = [ "arrow", "async-trait", @@ -1046,7 +1060,8 @@ dependencies = [ [[package]] name = "datafusion-functions-window" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ab18c2fb835614d06a75f24a9e09136d3a8c12a92d97c95a6af316a1787a9c5" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1062,7 +1077,8 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a77b73bc15e7d1967121fdc7a55d819bfb9d6c03766a6c322247dce9094a53a4" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1071,7 +1087,8 @@ dependencies = [ [[package]] name = "datafusion-macros" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09369b8d962291e808977cf94d495fd8b5b38647232d7ef562c27ac0f495b0af" dependencies = [ "datafusion-expr", "quote", @@ -1081,7 +1098,8 @@ dependencies = [ [[package]] name = "datafusion-optimizer" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2403a7e4a84637f3de7d8d4d7a9ccc0cc4be92d89b0161ba3ee5be82f0531c54" 
dependencies = [ "arrow", "chrono", @@ -1099,7 +1117,8 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86ff72ac702b62dbf2650c4e1d715ebd3e4aab14e3885e72e8549e250307347c" dependencies = [ "ahash", "arrow", @@ -1123,7 +1142,8 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60982b7d684e25579ee29754b4333057ed62e2cc925383c5f0bd8cab7962f435" dependencies = [ "ahash", "arrow", @@ -1137,7 +1157,8 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac5e85c189d5238a5cf181a624e450c4cd4c66ac77ca551d6f3ff9080bac90bb" dependencies = [ "arrow", "arrow-schema", @@ -1158,7 +1179,8 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c36bf163956d7e2542657c78b3383fdc78f791317ef358a359feffcdb968106f" dependencies = [ "ahash", "arrow", @@ -1189,7 +1211,8 @@ dependencies = [ [[package]] name = "datafusion-proto" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2db5d79f0c974041787b899d24dc91bdab2ff112d1942dd71356a4ce3b407e6c" dependencies = [ 
"arrow", "chrono", @@ -1204,7 +1227,8 @@ dependencies = [ [[package]] name = "datafusion-proto-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de21bde1603aac0ff32cf478e47081be6e3583c6861fe8f57034da911efe7578" dependencies = [ "arrow", "datafusion-common", @@ -1214,7 +1238,8 @@ dependencies = [ [[package]] name = "datafusion-sql" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13caa4daede211ecec53c78b13c503b592794d125f9a3cc3afe992edf9e7f43" dependencies = [ "arrow", "arrow-array", diff --git a/examples/ffi-table-provider/Cargo.toml b/examples/ffi-table-provider/Cargo.toml index 0e558fdd0..f4e4fda79 100644 --- a/examples/ffi-table-provider/Cargo.toml +++ b/examples/ffi-table-provider/Cargo.toml @@ -23,7 +23,7 @@ edition = "2021" [dependencies] datafusion = { version = "45.0.0" } datafusion-ffi = { version = "45.0.0" } -pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py38"] } +pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] } arrow = { version = "54" } arrow-array = { version = "54" } arrow-schema = { version = "54" } diff --git a/examples/ffi-table-provider/pyproject.toml b/examples/ffi-table-provider/pyproject.toml index 116efae9c..9cd25b423 100644 --- a/examples/ffi-table-provider/pyproject.toml +++ b/examples/ffi-table-provider/pyproject.toml @@ -21,7 +21,7 @@ build-backend = "maturin" [project] name = "ffi_table_provider" -requires-python = ">=3.8" +requires-python = ">=3.9" classifiers = [ "Programming Language :: Rust", "Programming Language :: Python :: Implementation :: CPython", diff --git a/pyproject.toml b/pyproject.toml index d16a18aa6..1c2733677 100644 --- a/pyproject.toml 
+++ b/pyproject.toml @@ -24,7 +24,7 @@ name = "datafusion" description = "Build and run queries against data" readme = "README.md" license = { file = "LICENSE.txt" } -requires-python = ">=3.8" +requires-python = ">=3.9" keywords = ["datafusion", "dataframe", "rust", "query-engine"] classifiers = [ "Development Status :: 2 - Pre-Alpha", @@ -35,7 +35,6 @@ classifiers = [ "Operating System :: Microsoft :: Windows", "Operating System :: POSIX :: Linux", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", diff --git a/uv.lock b/uv.lock index 587ddc8b7..619b92856 100644 --- a/uv.lock +++ b/uv.lock @@ -1,23 +1,10 @@ version = 1 -requires-python = ">=3.8" +requires-python = ">=3.9" resolution-markers = [ "python_full_version >= '3.12'", "python_full_version == '3.11.*'", "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", - "python_full_version < '3.9'", -] - -[[package]] -name = "alabaster" -version = "0.7.13" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/94/71/a8ee96d1fd95ca04a0d2e2d9c4081dac4c2d2b12f7ddb899c8cb9bfd1532/alabaster-0.7.13.tar.gz", hash = "sha256:a27a4a084d5e690e16e01e03ad2b2e552c61a65469419b907243193de1a84ae2", size = 11454 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/64/88/c7083fc61120ab661c5d0b82cb77079fc1429d3f913a456c1c82cf4658f7/alabaster-0.7.13-py3-none-any.whl", hash = "sha256:1ee19aca801bbabb5ba3f5f258e4422dfa86f82f3e9cefb0859b283cdd7f62a3", size = 13857 }, + "python_full_version < '3.10'", ] [[package]] @@ -25,7 +12,7 @@ name = "alabaster" version = "0.7.16" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] sdist = { url = 
"https://files.pythonhosted.org/packages/c9/3e/13dd8e5ed9094e734ac430b5d0eb4f2bb001708a8b7856cbf8e084e001ba/alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65", size = 23776 } wheels = [ @@ -46,42 +33,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b", size = 13929 }, ] -[[package]] -name = "appnope" -version = "0.1.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/35/5d/752690df9ef5b76e169e68d6a129fa6d08a7100ca7f754c89495db3c6019/appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee", size = 4170 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321 }, -] - -[[package]] -name = "astroid" -version = "3.2.4" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9e/53/1067e1113ecaf58312357f2cd93063674924119d80d173adc3f6f2387aa2/astroid-3.2.4.tar.gz", hash = "sha256:0e14202810b30da1b735827f78f5157be2bbd4a7a59b7707ca0bfc2fb4c0063a", size = 397576 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/80/96/b32bbbb46170a1c8b8b1f28c794202e25cfe743565e9d3469b8eb1e0cc05/astroid-3.2.4-py3-none-any.whl", hash = "sha256:413658a61eeca6202a59231abb473f932038fbcbf1666587f66d482083413a25", size = 276348 }, -] - [[package]] name = "astroid" version = "3.3.8" source = { registry = 
"https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] dependencies = [ - { name = "typing-extensions", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/80/c5/5c83c48bbf547f3dd8b587529db7cf5a265a3368b33e85e76af8ff6061d3/astroid-3.3.8.tar.gz", hash = "sha256:a88c7994f914a4ea8572fac479459f4955eeccc877be3f2d959a33273b0cf40b", size = 398196 } wheels = [ @@ -101,23 +58,11 @@ wheels = [ name = "babel" version = "2.16.0" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pytz", marker = "python_full_version < '3.9'" }, -] sdist = { url = "https://files.pythonhosted.org/packages/2a/74/f1bc80f23eeba13393b7222b11d95ca3af2c1e28edca18af487137eefed9/babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316", size = 9348104 } wheels = [ { url = "https://files.pythonhosted.org/packages/ed/20/bc79bc575ba2e2a7f70e8a1155618bb1301eaa5132a8271373a6903f73f8/babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b", size = 9587599 }, ] -[[package]] -name = "backcall" -version = "0.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a2/40/764a663805d84deee23043e1426a9175567db89c8b3287b5c2ad9f71aa93/backcall-0.2.0.tar.gz", hash = "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e", size = 18041 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4c/1c/ff6546b6c12603d8dd1070aa3c3d273ad4c07f5771689a7b69a550e8c951/backcall-0.2.0-py2.py3-none-any.whl", hash = "sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255", size = 11157 }, -] - [[package]] name = 
"beautifulsoup4" version = "4.12.3" @@ -194,14 +139,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/47/d7145bf2dc04684935d57d67dff9d6d795b2ba2796806bb109864be3a151/cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9", size = 488469 }, { url = "https://files.pythonhosted.org/packages/bf/ee/f94057fa6426481d663b88637a9a10e859e492c73d0384514a17d78ee205/cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d", size = 172475 }, { url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009 }, - { url = "https://files.pythonhosted.org/packages/48/08/15bf6b43ae9bd06f6b00ad8a91f5a8fe1069d4c9fab550a866755402724e/cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b", size = 182457 }, - { url = "https://files.pythonhosted.org/packages/c2/5b/f1523dd545f92f7df468e5f653ffa4df30ac222f3c884e51e139878f1cb5/cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964", size = 425932 }, - { url = "https://files.pythonhosted.org/packages/53/93/7e547ab4105969cc8c93b38a667b82a835dd2cc78f3a7dad6130cfd41e1d/cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9", size = 448585 }, - { url = "https://files.pythonhosted.org/packages/56/c4/a308f2c332006206bb511de219efeff090e9d63529ba0a77aae72e82248b/cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc", size = 
456268 }, - { url = "https://files.pythonhosted.org/packages/ca/5b/b63681518265f2f4060d2b60755c1c77ec89e5e045fc3773b72735ddaad5/cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c", size = 436592 }, - { url = "https://files.pythonhosted.org/packages/bb/19/b51af9f4a4faa4a8ac5a0e5d5c2522dcd9703d07fac69da34a36c4d960d3/cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1", size = 446512 }, - { url = "https://files.pythonhosted.org/packages/e2/63/2bed8323890cb613bbecda807688a31ed11a7fe7afe31f8faaae0206a9a3/cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8", size = 171576 }, - { url = "https://files.pythonhosted.org/packages/2f/70/80c33b044ebc79527447fd4fbc5455d514c3bb840dede4455de97da39b4d/cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1", size = 181229 }, { url = "https://files.pythonhosted.org/packages/b9/ea/8bb50596b8ffbc49ddd7a1ad305035daa770202a6b782fc164647c2673ad/cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16", size = 182220 }, { url = "https://files.pythonhosted.org/packages/ae/11/e77c8cd24f58285a82c23af484cf5b124a376b32644e445960d1a4654c3a/cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36", size = 178605 }, { url = "https://files.pythonhosted.org/packages/ed/65/25a8dc32c53bf5b7b6c2686b42ae2ad58743f7ff644844af7cdb29b49361/cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8", size = 424910 }, @@ -274,19 +211,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/e4/93/946a86ce20790e11312c87c75ba68d5f6ad2208cfb52b2d6a2c32840d922/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd", size = 145732 }, { url = "https://files.pythonhosted.org/packages/cd/e5/131d2fb1b0dddafc37be4f3a2fa79aa4c037368be9423061dccadfd90091/charset_normalizer-3.4.1-cp313-cp313-win32.whl", hash = "sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407", size = 95391 }, { url = "https://files.pythonhosted.org/packages/27/f2/4f9a69cc7712b9b5ad8fdb87039fd89abba997ad5cbe690d1835d40405b0/charset_normalizer-3.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971", size = 102702 }, - { url = "https://files.pythonhosted.org/packages/10/bd/6517ea94f2672e801011d50b5d06be2a0deaf566aea27bcdcd47e5195357/charset_normalizer-3.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ecddf25bee22fe4fe3737a399d0d177d72bc22be6913acfab364b40bce1ba83c", size = 195653 }, - { url = "https://files.pythonhosted.org/packages/e5/0d/815a2ba3f283b4eeaa5ece57acade365c5b4135f65a807a083c818716582/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c60ca7339acd497a55b0ea5d506b2a2612afb2826560416f6894e8b5770d4a9", size = 140701 }, - { url = "https://files.pythonhosted.org/packages/aa/17/c94be7ee0d142687e047fe1de72060f6d6837f40eedc26e87e6e124a3fc6/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7b2d86dd06bfc2ade3312a83a5c364c7ec2e3498f8734282c6c3d4b07b346b8", size = 150495 }, - { url = "https://files.pythonhosted.org/packages/f7/33/557ac796c47165fc141e4fb71d7b0310f67e05cb420756f3a82e0a0068e0/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd78cfcda14a1ef52584dbb008f7ac81c1328c0f58184bf9a84c49c605002da6", size = 142946 }, - { url = 
"https://files.pythonhosted.org/packages/1e/0d/38ef4ae41e9248d63fc4998d933cae22473b1b2ac4122cf908d0f5eb32aa/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e27f48bcd0957c6d4cb9d6fa6b61d192d0b13d5ef563e5f2ae35feafc0d179c", size = 144737 }, - { url = "https://files.pythonhosted.org/packages/43/01/754cdb29dd0560f58290aaaa284d43eea343ad0512e6ad3b8b5c11f08592/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01ad647cdd609225c5350561d084b42ddf732f4eeefe6e678765636791e78b9a", size = 147471 }, - { url = "https://files.pythonhosted.org/packages/ba/cd/861883ba5160c7a9bd242c30b2c71074cda2aefcc0addc91118e0d4e0765/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:619a609aa74ae43d90ed2e89bdd784765de0a25ca761b93e196d938b8fd1dbbd", size = 140801 }, - { url = "https://files.pythonhosted.org/packages/6f/7f/0c0dad447819e90b93f8ed238cc8f11b91353c23c19e70fa80483a155bed/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:89149166622f4db9b4b6a449256291dc87a99ee53151c74cbd82a53c8c2f6ccd", size = 149312 }, - { url = "https://files.pythonhosted.org/packages/8e/09/9f8abcc6fff60fb727268b63c376c8c79cc37b833c2dfe1f535dfb59523b/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:7709f51f5f7c853f0fb938bcd3bc59cdfdc5203635ffd18bf354f6967ea0f824", size = 152347 }, - { url = "https://files.pythonhosted.org/packages/be/e5/3f363dad2e24378f88ccf63ecc39e817c29f32e308ef21a7a6d9c1201165/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:345b0426edd4e18138d6528aed636de7a9ed169b4aaf9d61a8c19e39d26838ca", size = 149888 }, - { url = "https://files.pythonhosted.org/packages/e4/10/a78c0e91f487b4ad0ef7480ac765e15b774f83de2597f1b6ef0eaf7a2f99/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0907f11d019260cdc3f94fbdb23ff9125f6b5d1039b76003b5b0ac9d6a6c9d5b", 
size = 145169 }, - { url = "https://files.pythonhosted.org/packages/d3/81/396e7d7f5d7420da8273c91175d2e9a3f569288e3611d521685e4b9ac9cc/charset_normalizer-3.4.1-cp38-cp38-win32.whl", hash = "sha256:ea0d8d539afa5eb2728aa1932a988a9a7af94f18582ffae4bc10b3fbdad0626e", size = 95094 }, - { url = "https://files.pythonhosted.org/packages/40/bb/20affbbd9ea29c71ea123769dc568a6d42052ff5089c5fe23e21e21084a6/charset_normalizer-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:329ce159e82018d646c7ac45b01a430369d526569ec08516081727a20e9e4af4", size = 102139 }, { url = "https://files.pythonhosted.org/packages/7f/c0/b913f8f02836ed9ab32ea643c6fe4d3325c3d8627cf6e78098671cafff86/charset_normalizer-3.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b97e690a2118911e39b4042088092771b4ae3fc3aa86518f84b8cf6888dbdb41", size = 197867 }, { url = "https://files.pythonhosted.org/packages/0f/6c/2bee440303d705b6fb1e2ec789543edec83d32d258299b16eed28aad48e0/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78baa6d91634dfb69ec52a463534bc0df05dbd546209b79a3880a34487f4b84f", size = 141385 }, { url = "https://files.pythonhosted.org/packages/3d/04/cb42585f07f6f9fd3219ffb6f37d5a39b4fd2db2355b23683060029c35f7/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a2bc9f351a75ef49d664206d51f8e5ede9da246602dc2d2726837620ea034b2", size = 151367 }, @@ -351,11 +275,9 @@ wheels = [ [[package]] name = "datafusion" -version = "44.0.0" source = { editable = "." 
} dependencies = [ - { name = "pyarrow", version = "17.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "pyarrow", version = "18.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "pyarrow" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] @@ -369,20 +291,16 @@ dev = [ { name = "toml" }, ] docs = [ - { name = "ipython", version = "8.12.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "ipython", version = "8.18.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "ipython", version = "8.18.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "ipython", version = "8.31.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "jinja2" }, { name = "myst-parser", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "myst-parser", version = "4.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "pandas", version = "2.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "pandas", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "pandas" }, { name = "pickleshare" }, { name = "pydata-sphinx-theme" }, - { name = "setuptools", version = "75.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "setuptools", version = "75.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "sphinx", version = "7.1.2", source = { registry = 
"https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "setuptools" }, + { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "sphinx-autoapi" }, ] @@ -435,28 +353,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec", size = 9998 }, ] -[[package]] -name = "docutils" -version = "0.20.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/1f/53/a5da4f2c5739cf66290fac1431ee52aff6851c7c8ffd8264f13affd7bcdd/docutils-0.20.1.tar.gz", hash = "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b", size = 2058365 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/26/87/f238c0670b94533ac0353a4e2a1a771a0cc73277b88bff23d3ae35a256c1/docutils-0.20.1-py3-none-any.whl", hash = "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6", size = 572666 }, -] - [[package]] name = "docutils" version = "0.21.2" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/ae/ed/aefcc8cd0ba62a0560c3c18c33925362d46c6075480bfa4df87b28e169a9/docutils-0.21.2.tar.gz", hash = 
"sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f", size = 2204444 } wheels = [ { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408 }, @@ -503,8 +403,7 @@ name = "importlib-metadata" version = "8.5.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "zipp", version = "3.20.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "zipp", version = "3.21.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "zipp", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/cd/12/33e59336dca5be0c398a7482335911a33aa0e20776128f038019f1a95f1b/importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7", size = 55304 } wheels = [ @@ -520,52 +419,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, ] -[[package]] -name = "ipython" -version = "8.12.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "appnope", marker = "python_full_version < '3.9' and sys_platform == 'darwin'" }, - { name = "backcall", marker = "python_full_version < '3.9'" }, - { name = "colorama", marker = "python_full_version < '3.9' and sys_platform == 'win32'" }, - { name = "decorator", marker = "python_full_version < '3.9'" }, - { name = "jedi", marker = "python_full_version < '3.9'" }, - { name = "matplotlib-inline", marker = "python_full_version < '3.9'" }, - 
{ name = "pexpect", marker = "python_full_version < '3.9' and sys_platform != 'win32'" }, - { name = "pickleshare", marker = "python_full_version < '3.9'" }, - { name = "prompt-toolkit", marker = "python_full_version < '3.9'" }, - { name = "pygments", marker = "python_full_version < '3.9'" }, - { name = "stack-data", marker = "python_full_version < '3.9'" }, - { name = "traitlets", marker = "python_full_version < '3.9'" }, - { name = "typing-extensions", marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9e/6a/44ef299b1762f5a73841e87fae8a73a8cc8aee538d6dc8c77a5afe1fd2ce/ipython-8.12.3.tar.gz", hash = "sha256:3910c4b54543c2ad73d06579aa771041b7d5707b033bd488669b4cf544e3b363", size = 5470171 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8d/97/8fe103906cd81bc42d3b0175b5534a9f67dccae47d6451131cf8d0d70bb2/ipython-8.12.3-py3-none-any.whl", hash = "sha256:b0340d46a933d27c657b211a329d0be23793c36595acf9e6ef4164bc01a1804c", size = 798307 }, -] - [[package]] name = "ipython" version = "8.18.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "colorama", marker = "python_full_version == '3.9.*' and sys_platform == 'win32'" }, - { name = "decorator", marker = "python_full_version == '3.9.*'" }, - { name = "exceptiongroup", marker = "python_full_version == '3.9.*'" }, - { name = "jedi", marker = "python_full_version == '3.9.*'" }, - { name = "matplotlib-inline", marker = "python_full_version == '3.9.*'" }, - { name = "pexpect", marker = "python_full_version == '3.9.*' and sys_platform != 'win32'" }, - { name = "prompt-toolkit", marker = "python_full_version == '3.9.*'" }, - { name = "pygments", marker = "python_full_version == '3.9.*'" }, - { name = "stack-data", marker = "python_full_version == '3.9.*'" }, - { name = "traitlets", marker = "python_full_version == '3.9.*'" }, - { name = 
"typing-extensions", marker = "python_full_version == '3.9.*'" }, + { name = "colorama", marker = "python_full_version < '3.10' and sys_platform == 'win32'" }, + { name = "decorator", marker = "python_full_version < '3.10'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.10'" }, + { name = "jedi", marker = "python_full_version < '3.10'" }, + { name = "matplotlib-inline", marker = "python_full_version < '3.10'" }, + { name = "pexpect", marker = "python_full_version < '3.10' and sys_platform != 'win32'" }, + { name = "prompt-toolkit", marker = "python_full_version < '3.10'" }, + { name = "pygments", marker = "python_full_version < '3.10'" }, + { name = "stack-data", marker = "python_full_version < '3.10'" }, + { name = "traitlets", marker = "python_full_version < '3.10'" }, + { name = "typing-extensions", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b1/b9/3ba6c45a6df813c09a48bac313c22ff83efa26cbb55011218d925a46e2ad/ipython-8.18.1.tar.gz", hash = "sha256:ca6f079bb33457c66e233e4580ebfc4128855b4cf6370dddd73842a9563e8a27", size = 5486330 } wheels = [ @@ -616,8 +488,7 @@ name = "jinja2" version = "3.1.5" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "markupsafe", version = "2.1.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "markupsafe", version = "3.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "markupsafe" }, ] sdist = { url = "https://files.pythonhosted.org/packages/af/92/b3130cbbf5591acf9ade8708c365f3238046ac7cb8ccba6e81abccb0ccff/jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb", size = 244674 } wheels = [ @@ -636,77 +507,10 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528 }, ] -[[package]] -name = "markupsafe" -version = "2.1.5" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/87/5b/aae44c6655f3801e81aa3eef09dbbf012431987ba564d7231722f68df02d/MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b", size = 19384 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e4/54/ad5eb37bf9d51800010a74e4665425831a9db4e7c4e0fde4352e391e808e/MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc", size = 18206 }, - { url = "https://files.pythonhosted.org/packages/6a/4a/a4d49415e600bacae038c67f9fecc1d5433b9d3c71a4de6f33537b89654c/MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5", size = 14079 }, - { url = "https://files.pythonhosted.org/packages/0a/7b/85681ae3c33c385b10ac0f8dd025c30af83c78cec1c37a6aa3b55e67f5ec/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46", size = 26620 }, - { url = "https://files.pythonhosted.org/packages/7c/52/2b1b570f6b8b803cef5ac28fdf78c0da318916c7d2fe9402a84d591b394c/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f", size = 25818 }, - { url = 
"https://files.pythonhosted.org/packages/29/fe/a36ba8c7ca55621620b2d7c585313efd10729e63ef81e4e61f52330da781/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900", size = 25493 }, - { url = "https://files.pythonhosted.org/packages/60/ae/9c60231cdfda003434e8bd27282b1f4e197ad5a710c14bee8bea8a9ca4f0/MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff", size = 30630 }, - { url = "https://files.pythonhosted.org/packages/65/dc/1510be4d179869f5dafe071aecb3f1f41b45d37c02329dfba01ff59e5ac5/MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad", size = 29745 }, - { url = "https://files.pythonhosted.org/packages/30/39/8d845dd7d0b0613d86e0ef89549bfb5f61ed781f59af45fc96496e897f3a/MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd", size = 30021 }, - { url = "https://files.pythonhosted.org/packages/c7/5c/356a6f62e4f3c5fbf2602b4771376af22a3b16efa74eb8716fb4e328e01e/MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4", size = 16659 }, - { url = "https://files.pythonhosted.org/packages/69/48/acbf292615c65f0604a0c6fc402ce6d8c991276e16c80c46a8f758fbd30c/MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5", size = 17213 }, - { url = "https://files.pythonhosted.org/packages/11/e7/291e55127bb2ae67c64d66cef01432b5933859dfb7d6949daa721b89d0b3/MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f", size = 18219 }, - { url = 
"https://files.pythonhosted.org/packages/6b/cb/aed7a284c00dfa7c0682d14df85ad4955a350a21d2e3b06d8240497359bf/MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2", size = 14098 }, - { url = "https://files.pythonhosted.org/packages/1c/cf/35fe557e53709e93feb65575c93927942087e9b97213eabc3fe9d5b25a55/MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced", size = 29014 }, - { url = "https://files.pythonhosted.org/packages/97/18/c30da5e7a0e7f4603abfc6780574131221d9148f323752c2755d48abad30/MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5", size = 28220 }, - { url = "https://files.pythonhosted.org/packages/0c/40/2e73e7d532d030b1e41180807a80d564eda53babaf04d65e15c1cf897e40/MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c", size = 27756 }, - { url = "https://files.pythonhosted.org/packages/18/46/5dca760547e8c59c5311b332f70605d24c99d1303dd9a6e1fc3ed0d73561/MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f", size = 33988 }, - { url = "https://files.pythonhosted.org/packages/6d/c5/27febe918ac36397919cd4a67d5579cbbfa8da027fa1238af6285bb368ea/MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a", size = 32718 }, - { url = "https://files.pythonhosted.org/packages/f8/81/56e567126a2c2bc2684d6391332e357589a96a76cb9f8e5052d85cb0ead8/MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f", size = 33317 }, - { url = 
"https://files.pythonhosted.org/packages/00/0b/23f4b2470accb53285c613a3ab9ec19dc944eaf53592cb6d9e2af8aa24cc/MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906", size = 16670 }, - { url = "https://files.pythonhosted.org/packages/b7/a2/c78a06a9ec6d04b3445a949615c4c7ed86a0b2eb68e44e7541b9d57067cc/MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617", size = 17224 }, - { url = "https://files.pythonhosted.org/packages/53/bd/583bf3e4c8d6a321938c13f49d44024dbe5ed63e0a7ba127e454a66da974/MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1", size = 18215 }, - { url = "https://files.pythonhosted.org/packages/48/d6/e7cd795fc710292c3af3a06d80868ce4b02bfbbf370b7cee11d282815a2a/MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4", size = 14069 }, - { url = "https://files.pythonhosted.org/packages/51/b5/5d8ec796e2a08fc814a2c7d2584b55f889a55cf17dd1a90f2beb70744e5c/MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee", size = 29452 }, - { url = "https://files.pythonhosted.org/packages/0a/0d/2454f072fae3b5a137c119abf15465d1771319dfe9e4acbb31722a0fff91/MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5", size = 28462 }, - { url = "https://files.pythonhosted.org/packages/2d/75/fd6cb2e68780f72d47e6671840ca517bda5ef663d30ada7616b0462ad1e3/MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b", size = 27869 }, - { url = 
"https://files.pythonhosted.org/packages/b0/81/147c477391c2750e8fc7705829f7351cf1cd3be64406edcf900dc633feb2/MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a", size = 33906 }, - { url = "https://files.pythonhosted.org/packages/8b/ff/9a52b71839d7a256b563e85d11050e307121000dcebc97df120176b3ad93/MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f", size = 32296 }, - { url = "https://files.pythonhosted.org/packages/88/07/2dc76aa51b481eb96a4c3198894f38b480490e834479611a4053fbf08623/MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169", size = 33038 }, - { url = "https://files.pythonhosted.org/packages/96/0c/620c1fb3661858c0e37eb3cbffd8c6f732a67cd97296f725789679801b31/MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad", size = 16572 }, - { url = "https://files.pythonhosted.org/packages/3f/14/c3554d512d5f9100a95e737502f4a2323a1959f6d0d01e0d0997b35f7b10/MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb", size = 17127 }, - { url = "https://files.pythonhosted.org/packages/f8/ff/2c942a82c35a49df5de3a630ce0a8456ac2969691b230e530ac12314364c/MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a", size = 18192 }, - { url = "https://files.pythonhosted.org/packages/4f/14/6f294b9c4f969d0c801a4615e221c1e084722ea6114ab2114189c5b8cbe0/MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46", size = 14072 }, - { url = 
"https://files.pythonhosted.org/packages/81/d4/fd74714ed30a1dedd0b82427c02fa4deec64f173831ec716da11c51a50aa/MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532", size = 26928 }, - { url = "https://files.pythonhosted.org/packages/c7/bd/50319665ce81bb10e90d1cf76f9e1aa269ea6f7fa30ab4521f14d122a3df/MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab", size = 26106 }, - { url = "https://files.pythonhosted.org/packages/4c/6f/f2b0f675635b05f6afd5ea03c094557bdb8622fa8e673387444fe8d8e787/MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68", size = 25781 }, - { url = "https://files.pythonhosted.org/packages/51/e0/393467cf899b34a9d3678e78961c2c8cdf49fb902a959ba54ece01273fb1/MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0", size = 30518 }, - { url = "https://files.pythonhosted.org/packages/f6/02/5437e2ad33047290dafced9df741d9efc3e716b75583bbd73a9984f1b6f7/MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4", size = 29669 }, - { url = "https://files.pythonhosted.org/packages/0e/7d/968284145ffd9d726183ed6237c77938c021abacde4e073020f920e060b2/MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3", size = 29933 }, - { url = "https://files.pythonhosted.org/packages/bf/f3/ecb00fc8ab02b7beae8699f34db9357ae49d9f21d4d3de6f305f34fa949e/MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff", size = 16656 }, - { url = 
"https://files.pythonhosted.org/packages/92/21/357205f03514a49b293e214ac39de01fadd0970a6e05e4bf1ddd0ffd0881/MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029", size = 17206 }, - { url = "https://files.pythonhosted.org/packages/0f/31/780bb297db036ba7b7bbede5e1d7f1e14d704ad4beb3ce53fb495d22bc62/MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf", size = 18193 }, - { url = "https://files.pythonhosted.org/packages/6c/77/d77701bbef72892affe060cdacb7a2ed7fd68dae3b477a8642f15ad3b132/MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2", size = 14073 }, - { url = "https://files.pythonhosted.org/packages/d9/a7/1e558b4f78454c8a3a0199292d96159eb4d091f983bc35ef258314fe7269/MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8", size = 26486 }, - { url = "https://files.pythonhosted.org/packages/5f/5a/360da85076688755ea0cceb92472923086993e86b5613bbae9fbc14136b0/MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3", size = 25685 }, - { url = "https://files.pythonhosted.org/packages/6a/18/ae5a258e3401f9b8312f92b028c54d7026a97ec3ab20bfaddbdfa7d8cce8/MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465", size = 25338 }, - { url = "https://files.pythonhosted.org/packages/0b/cc/48206bd61c5b9d0129f4d75243b156929b04c94c09041321456fd06a876d/MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e", size = 30439 }, - { url = 
"https://files.pythonhosted.org/packages/d1/06/a41c112ab9ffdeeb5f77bc3e331fdadf97fa65e52e44ba31880f4e7f983c/MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea", size = 29531 }, - { url = "https://files.pythonhosted.org/packages/02/8c/ab9a463301a50dab04d5472e998acbd4080597abc048166ded5c7aa768c8/MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6", size = 29823 }, - { url = "https://files.pythonhosted.org/packages/bc/29/9bc18da763496b055d8e98ce476c8e718dcfd78157e17f555ce6dd7d0895/MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf", size = 16658 }, - { url = "https://files.pythonhosted.org/packages/f6/f8/4da07de16f10551ca1f640c92b5f316f9394088b183c6a57183df6de5ae4/MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5", size = 17211 }, -] - [[package]] name = "markupsafe" version = "3.0.2" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/b2/97/5d42485e71dfc078108a86d6de8fa46db44a1a9295e89c5d6d4a06e23a62/markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0", size = 20537 } wheels = [ { url = "https://files.pythonhosted.org/packages/04/90/d08277ce111dd22f77149fd1a5d4653eeb3b3eaacbdfcbae5afb2600eebd/MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8", size = 14357 }, @@ -832,18 +636,15 @@ name = "myst-parser" version = "3.0.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - 
"python_full_version == '3.9.*'", - "python_full_version < '3.9'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "docutils", version = "0.20.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "docutils", marker = "python_full_version < '3.10'" }, { name = "jinja2", marker = "python_full_version < '3.10'" }, { name = "markdown-it-py", marker = "python_full_version < '3.10'" }, { name = "mdit-py-plugins", marker = "python_full_version < '3.10'" }, { name = "pyyaml", marker = "python_full_version < '3.10'" }, - { name = "sphinx", version = "7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/49/64/e2f13dac02f599980798c01156393b781aec983b52a6e4057ee58f07c43a/myst_parser-3.0.1.tar.gz", hash = "sha256:88f0cb406cb363b077d176b51c476f62d60604d68a8dcdf4832e080441301a87", size = 92392 } wheels = [ @@ -860,7 +661,7 @@ resolution-markers = [ "python_full_version == '3.10.*'", ] dependencies = [ - { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "docutils", marker = "python_full_version >= '3.10'" }, { name = "jinja2", marker = "python_full_version >= '3.10'" }, { name = "markdown-it-py", marker = "python_full_version >= '3.10'" }, { name = "mdit-py-plugins", marker = "python_full_version >= '3.10'" }, @@ -872,50 +673,12 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/ca/b4/b036f8fdb667587bb37df29dc6644681dd78b7a2a6321a34684b79412b28/myst_parser-4.0.0-py3-none-any.whl", hash = "sha256:b9317997552424448c6096c2558872fdb6f81d3ecb3a40ce84a7518798f3f28d", size = 84563 }, ] -[[package]] -name = "numpy" -version = "1.24.4" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/a4/9b/027bec52c633f6556dba6b722d9a0befb40498b9ceddd29cbe67a45a127c/numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463", size = 10911229 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6b/80/6cdfb3e275d95155a34659163b83c09e3a3ff9f1456880bec6cc63d71083/numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64", size = 19789140 }, - { url = "https://files.pythonhosted.org/packages/64/5f/3f01d753e2175cfade1013eea08db99ba1ee4bdb147ebcf3623b75d12aa7/numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1", size = 13854297 }, - { url = "https://files.pythonhosted.org/packages/5a/b3/2f9c21d799fa07053ffa151faccdceeb69beec5a010576b8991f614021f7/numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4", size = 13995611 }, - { url = "https://files.pythonhosted.org/packages/10/be/ae5bf4737cb79ba437879915791f6f26d92583c738d7d960ad94e5c36adf/numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6", size = 17282357 }, - { url = "https://files.pythonhosted.org/packages/c0/64/908c1087be6285f40e4b3e79454552a701664a079321cff519d8c7051d06/numpy-1.24.4-cp310-cp310-win32.whl", hash = 
"sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc", size = 12429222 }, - { url = "https://files.pythonhosted.org/packages/22/55/3d5a7c1142e0d9329ad27cece17933b0e2ab4e54ddc5c1861fbfeb3f7693/numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e", size = 14841514 }, - { url = "https://files.pythonhosted.org/packages/a9/cc/5ed2280a27e5dab12994c884f1f4d8c3bd4d885d02ae9e52a9d213a6a5e2/numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810", size = 19775508 }, - { url = "https://files.pythonhosted.org/packages/c0/bc/77635c657a3668cf652806210b8662e1aff84b818a55ba88257abf6637a8/numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254", size = 13840033 }, - { url = "https://files.pythonhosted.org/packages/a7/4c/96cdaa34f54c05e97c1c50f39f98d608f96f0677a6589e64e53104e22904/numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7", size = 13991951 }, - { url = "https://files.pythonhosted.org/packages/22/97/dfb1a31bb46686f09e68ea6ac5c63fdee0d22d7b23b8f3f7ea07712869ef/numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5", size = 17278923 }, - { url = "https://files.pythonhosted.org/packages/35/e2/76a11e54139654a324d107da1d98f99e7aa2a7ef97cfd7c631fba7dbde71/numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d", size = 12422446 }, - { url = "https://files.pythonhosted.org/packages/d8/ec/ebef2f7d7c28503f958f0f8b992e7ce606fb74f9e891199329d5f5f87404/numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694", size = 14834466 }, - { 
url = "https://files.pythonhosted.org/packages/11/10/943cfb579f1a02909ff96464c69893b1d25be3731b5d3652c2e0cf1281ea/numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61", size = 19780722 }, - { url = "https://files.pythonhosted.org/packages/a7/ae/f53b7b265fdc701e663fbb322a8e9d4b14d9cb7b2385f45ddfabfc4327e4/numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f", size = 13843102 }, - { url = "https://files.pythonhosted.org/packages/25/6f/2586a50ad72e8dbb1d8381f837008a0321a3516dfd7cb57fc8cf7e4bb06b/numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e", size = 14039616 }, - { url = "https://files.pythonhosted.org/packages/98/5d/5738903efe0ecb73e51eb44feafba32bdba2081263d40c5043568ff60faf/numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc", size = 17316263 }, - { url = "https://files.pythonhosted.org/packages/d1/57/8d328f0b91c733aa9aa7ee540dbc49b58796c862b4fbcb1146c701e888da/numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2", size = 12455660 }, - { url = "https://files.pythonhosted.org/packages/69/65/0d47953afa0ad569d12de5f65d964321c208492064c38fe3b0b9744f8d44/numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706", size = 14868112 }, - { url = "https://files.pythonhosted.org/packages/9a/cd/d5b0402b801c8a8b56b04c1e85c6165efab298d2f0ab741c2406516ede3a/numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400", size = 19816549 }, - { url = 
"https://files.pythonhosted.org/packages/14/27/638aaa446f39113a3ed38b37a66243e21b38110d021bfcb940c383e120f2/numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f", size = 13879950 }, - { url = "https://files.pythonhosted.org/packages/8f/27/91894916e50627476cff1a4e4363ab6179d01077d71b9afed41d9e1f18bf/numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9", size = 14030228 }, - { url = "https://files.pythonhosted.org/packages/7a/7c/d7b2a0417af6428440c0ad7cb9799073e507b1a465f827d058b826236964/numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d", size = 17311170 }, - { url = "https://files.pythonhosted.org/packages/18/9d/e02ace5d7dfccee796c37b995c63322674daf88ae2f4a4724c5dd0afcc91/numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835", size = 12454918 }, - { url = "https://files.pythonhosted.org/packages/63/38/6cc19d6b8bfa1d1a459daf2b3fe325453153ca7019976274b6f33d8b5663/numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8", size = 14867441 }, - { url = "https://files.pythonhosted.org/packages/a4/fd/8dff40e25e937c94257455c237b9b6bf5a30d42dd1cc11555533be099492/numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef", size = 19156590 }, - { url = "https://files.pythonhosted.org/packages/42/e7/4bf953c6e05df90c6d351af69966384fed8e988d0e8c54dad7103b59f3ba/numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a", size = 16705744 }, - { url = 
"https://files.pythonhosted.org/packages/fc/dd/9106005eb477d022b60b3817ed5937a43dad8fd1f20b0610ea8a32fcb407/numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2", size = 14734290 }, -] - [[package]] name = "numpy" version = "2.0.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] sdist = { url = "https://files.pythonhosted.org/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015 } wheels = [ @@ -1041,63 +804,16 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 }, ] -[[package]] -name = "pandas" -version = "2.0.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "numpy", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "python-dateutil", marker = "python_full_version < '3.9'" }, - { name = "pytz", marker = "python_full_version < '3.9'" }, - { name = "tzdata", marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b1/a7/824332581e258b5aa4f3763ecb2a797e5f9a54269044ba2e50ac19936b32/pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c", size = 5284455 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3c/b2/0d4a5729ce1ce11630c4fc5d5522a33b967b3ca146c210f58efde7c40e99/pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8", size = 11760908 }, - { url = "https://files.pythonhosted.org/packages/4a/f6/f620ca62365d83e663a255a41b08d2fc2eaf304e0b8b21bb6d62a7390fe3/pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f", size = 10823486 }, - { url = "https://files.pythonhosted.org/packages/c2/59/cb4234bc9b968c57e81861b306b10cd8170272c57b098b724d3de5eda124/pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183", size = 11571897 }, - { url = "https://files.pythonhosted.org/packages/e3/59/35a2892bf09ded9c1bf3804461efe772836a5261ef5dfb4e264ce813ff99/pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0", size = 12306421 }, - { url = "https://files.pythonhosted.org/packages/94/71/3a0c25433c54bb29b48e3155b959ac78f4c4f2f06f94d8318aac612cb80f/pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210", size = 9540792 }, - { url = "https://files.pythonhosted.org/packages/ed/30/b97456e7063edac0e5a405128065f0cd2033adfe3716fb2256c186bd41d0/pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e", size = 10664333 }, - { url = "https://files.pythonhosted.org/packages/b3/92/a5e5133421b49e901a12e02a6a7ef3a0130e10d13db8cb657fdd0cba3b90/pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8", size = 11645672 }, - { url = "https://files.pythonhosted.org/packages/8f/bb/aea1fbeed5b474cb8634364718abe9030d7cc7a30bf51f40bd494bbc89a2/pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26", size = 10693229 
}, - { url = "https://files.pythonhosted.org/packages/d6/90/e7d387f1a416b14e59290baa7a454a90d719baebbf77433ff1bdcc727800/pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d", size = 11581591 }, - { url = "https://files.pythonhosted.org/packages/d0/28/88b81881c056376254618fad622a5e94b5126db8c61157ea1910cd1c040a/pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df", size = 12219370 }, - { url = "https://files.pythonhosted.org/packages/e4/a5/212b9039e25bf8ebb97e417a96660e3dc925dacd3f8653d531b8f7fd9be4/pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd", size = 9482935 }, - { url = "https://files.pythonhosted.org/packages/9e/71/756a1be6bee0209d8c0d8c5e3b9fc72c00373f384a4017095ec404aec3ad/pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b", size = 10607692 }, - { url = "https://files.pythonhosted.org/packages/78/a8/07dd10f90ca915ed914853cd57f79bfc22e1ef4384ab56cb4336d2fc1f2a/pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061", size = 11653303 }, - { url = "https://files.pythonhosted.org/packages/53/c3/f8e87361f7fdf42012def602bfa2a593423c729f5cb7c97aed7f51be66ac/pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5", size = 10710932 }, - { url = "https://files.pythonhosted.org/packages/a7/87/828d50c81ce0f434163bf70b925a0eec6076808e0bca312a79322b141f66/pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089", size = 11684018 }, - { url = 
"https://files.pythonhosted.org/packages/f8/7f/5b047effafbdd34e52c9e2d7e44f729a0655efafb22198c45cf692cdc157/pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0", size = 12353723 }, - { url = "https://files.pythonhosted.org/packages/ea/ae/26a2eda7fa581347d69e51f93892493b2074ef3352ac71033c9f32c52389/pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02", size = 9646403 }, - { url = "https://files.pythonhosted.org/packages/c3/6c/ea362eef61f05553aaf1a24b3e96b2d0603f5dc71a3bd35688a24ed88843/pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78", size = 10777638 }, - { url = "https://files.pythonhosted.org/packages/f8/c7/cfef920b7b457dff6928e824896cb82367650ea127d048ee0b820026db4f/pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b", size = 11834160 }, - { url = "https://files.pythonhosted.org/packages/6c/1c/689c9d99bc4e5d366a5fd871f0bcdee98a6581e240f96b78d2d08f103774/pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e", size = 10862752 }, - { url = "https://files.pythonhosted.org/packages/cc/b8/4d082f41c27c95bf90485d1447b647cc7e5680fea75e315669dc6e4cb398/pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b", size = 11715852 }, - { url = "https://files.pythonhosted.org/packages/9e/0d/91a9fd2c202f2b1d97a38ab591890f86480ecbb596cbc56d035f6f23fdcc/pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641", size = 12398496 }, - { url = 
"https://files.pythonhosted.org/packages/26/7d/d8aa0a2c4f3f5f8ea59fb946c8eafe8f508090ca73e2b08a9af853c1103e/pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682", size = 9630766 }, - { url = "https://files.pythonhosted.org/packages/9a/f2/0ad053856debbe90c83de1b4f05915f85fd2146f20faf9daa3b320d36df3/pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc", size = 10755902 }, -] - [[package]] name = "pandas" version = "2.2.3" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "python-dateutil", marker = "python_full_version >= '3.9'" }, - { name = "pytz", marker = "python_full_version >= '3.9'" }, - { name = "tzdata", marker = "python_full_version >= '3.9'" }, + { name = "python-dateutil" }, + { name = "pytz" }, + { name = "tzdata" }, ] sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213 } wheels = [ @@ -1213,65 +929,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 
11842 }, ] -[[package]] -name = "pyarrow" -version = "17.0.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "numpy", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/27/4e/ea6d43f324169f8aec0e57569443a38bab4b398d09769ca64f7b4d467de3/pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28", size = 1112479 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/39/5d/78d4b040bc5ff2fc6c3d03e80fca396b742f6c125b8af06bcf7427f931bc/pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07", size = 28994846 }, - { url = "https://files.pythonhosted.org/packages/3b/73/8ed168db7642e91180330e4ea9f3ff8bab404678f00d32d7df0871a4933b/pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655", size = 27165908 }, - { url = "https://files.pythonhosted.org/packages/81/36/e78c24be99242063f6d0590ef68c857ea07bdea470242c361e9a15bd57a4/pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545", size = 39264209 }, - { url = "https://files.pythonhosted.org/packages/18/4c/3db637d7578f683b0a8fb8999b436bdbedd6e3517bd4f90c70853cf3ad20/pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2", size = 39862883 }, - { url = "https://files.pythonhosted.org/packages/81/3c/0580626896c842614a523e66b351181ed5bb14e5dfc263cd68cea2c46d90/pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8", size = 38723009 }, - { 
url = "https://files.pythonhosted.org/packages/ee/fb/c1b47f0ada36d856a352da261a44d7344d8f22e2f7db3945f8c3b81be5dd/pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047", size = 39855626 }, - { url = "https://files.pythonhosted.org/packages/19/09/b0a02908180a25d57312ab5919069c39fddf30602568980419f4b02393f6/pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087", size = 25147242 }, - { url = "https://files.pythonhosted.org/packages/f9/46/ce89f87c2936f5bb9d879473b9663ce7a4b1f4359acc2f0eb39865eaa1af/pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977", size = 29028748 }, - { url = "https://files.pythonhosted.org/packages/8d/8e/ce2e9b2146de422f6638333c01903140e9ada244a2a477918a368306c64c/pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3", size = 27190965 }, - { url = "https://files.pythonhosted.org/packages/3b/c8/5675719570eb1acd809481c6d64e2136ffb340bc387f4ca62dce79516cea/pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15", size = 39269081 }, - { url = "https://files.pythonhosted.org/packages/5e/78/3931194f16ab681ebb87ad252e7b8d2c8b23dad49706cadc865dff4a1dd3/pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597", size = 39864921 }, - { url = "https://files.pythonhosted.org/packages/d8/81/69b6606093363f55a2a574c018901c40952d4e902e670656d18213c71ad7/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420", size = 38740798 }, - { url = 
"https://files.pythonhosted.org/packages/4c/21/9ca93b84b92ef927814cb7ba37f0774a484c849d58f0b692b16af8eebcfb/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4", size = 39871877 }, - { url = "https://files.pythonhosted.org/packages/30/d1/63a7c248432c71c7d3ee803e706590a0b81ce1a8d2b2ae49677774b813bb/pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03", size = 25151089 }, - { url = "https://files.pythonhosted.org/packages/d4/62/ce6ac1275a432b4a27c55fe96c58147f111d8ba1ad800a112d31859fae2f/pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22", size = 29019418 }, - { url = "https://files.pythonhosted.org/packages/8e/0a/dbd0c134e7a0c30bea439675cc120012337202e5fac7163ba839aa3691d2/pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053", size = 27152197 }, - { url = "https://files.pythonhosted.org/packages/cb/05/3f4a16498349db79090767620d6dc23c1ec0c658a668d61d76b87706c65d/pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a", size = 39263026 }, - { url = "https://files.pythonhosted.org/packages/c2/0c/ea2107236740be8fa0e0d4a293a095c9f43546a2465bb7df34eee9126b09/pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc", size = 39880798 }, - { url = "https://files.pythonhosted.org/packages/f6/b0/b9164a8bc495083c10c281cc65064553ec87b7537d6f742a89d5953a2a3e/pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a", size = 38715172 }, - { url = 
"https://files.pythonhosted.org/packages/f1/c4/9625418a1413005e486c006e56675334929fad864347c5ae7c1b2e7fe639/pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b", size = 39874508 }, - { url = "https://files.pythonhosted.org/packages/ae/49/baafe2a964f663413be3bd1cf5c45ed98c5e42e804e2328e18f4570027c1/pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7", size = 25099235 }, - { url = "https://files.pythonhosted.org/packages/8d/bd/8f52c1d7b430260f80a349cffa2df351750a737b5336313d56dcadeb9ae1/pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204", size = 28999345 }, - { url = "https://files.pythonhosted.org/packages/64/d9/51e35550f2f18b8815a2ab25948f735434db32000c0e91eba3a32634782a/pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8", size = 27168441 }, - { url = "https://files.pythonhosted.org/packages/18/d8/7161d87d07ea51be70c49f615004c1446d5723622a18b2681f7e4b71bf6e/pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155", size = 39363163 }, - { url = "https://files.pythonhosted.org/packages/3f/08/bc497130789833de09e345e3ce4647e3ce86517c4f70f2144f0367ca378b/pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145", size = 39965253 }, - { url = "https://files.pythonhosted.org/packages/d3/2e/493dd7db889402b4c7871ca7dfdd20f2c5deedbff802d3eb8576359930f9/pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c", size = 38805378 }, - { url = 
"https://files.pythonhosted.org/packages/e6/c1/4c6bcdf7a820034aa91a8b4d25fef38809be79b42ca7aaa16d4680b0bbac/pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c", size = 39958364 }, - { url = "https://files.pythonhosted.org/packages/d1/db/42ac644453cfdfc60fe002b46d647fe7a6dfad753ef7b28e99b4c936ad5d/pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca", size = 25229211 }, - { url = "https://files.pythonhosted.org/packages/43/e0/a898096d35be240aa61fb2d54db58b86d664b10e1e51256f9300f47565e8/pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb", size = 29007881 }, - { url = "https://files.pythonhosted.org/packages/59/22/f7d14907ed0697b5dd488d393129f2738629fa5bcba863e00931b7975946/pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df", size = 27178117 }, - { url = "https://files.pythonhosted.org/packages/bf/ee/661211feac0ed48467b1d5c57298c91403809ec3ab78b1d175e1d6ad03cf/pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687", size = 39273896 }, - { url = "https://files.pythonhosted.org/packages/af/61/bcd9b58e38ead6ad42b9ed00da33a3f862bc1d445e3d3164799c25550ac2/pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b", size = 39875438 }, - { url = "https://files.pythonhosted.org/packages/75/63/29d1bfcc57af73cde3fc3baccab2f37548de512dbe0ab294b033cd203516/pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5", size = 38735092 }, - { url = 
"https://files.pythonhosted.org/packages/39/f4/90258b4de753df7cc61cefb0312f8abcf226672e96cc64996e66afce817a/pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda", size = 39867610 }, - { url = "https://files.pythonhosted.org/packages/e7/f6/b75d4816c32f1618ed31a005ee635dd1d91d8164495d94f2ea092f594661/pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204", size = 25148611 }, -] - [[package]] name = "pyarrow" version = "18.1.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/7f/7b/640785a9062bb00314caa8a387abce547d2a420cf09bd6c715fe659ccffb/pyarrow-18.1.0.tar.gz", hash = "sha256:9386d3ca9c145b5539a1cfc75df07757dff870168c959b473a0bccbc3abc8c73", size = 1118671 } wheels = [ { url = "https://files.pythonhosted.org/packages/1a/bb/8d4a1573f66e0684f190dd2b55fd0b97a7214de8882d58a3867e777bf640/pyarrow-18.1.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e21488d5cfd3d8b500b3238a6c4b075efabc18f0f6d80b29239737ebd69caa6c", size = 29531620 }, @@ -1332,10 +993,8 @@ version = "0.8.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "beautifulsoup4" }, - { name = "docutils", version = "0.20.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "sphinx", version = "7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" 
}, + { name = "docutils" }, + { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/fc/d6/3921de802cf1ee771f0e76c9068b52498aeb8eeec6b830ff931c81c7ecf3/pydata_sphinx_theme-0.8.0.tar.gz", hash = "sha256:9f72015d9c572ea92e3007ab221a8325767c426783b6b9941813e65fa988dc90", size = 1123746 } @@ -1349,13 +1008,11 @@ version = "2.5.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "deprecated" }, - { name = "pyjwt", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, extra = ["crypto"], marker = "python_full_version < '3.9'" }, - { name = "pyjwt", version = "2.10.1", source = { registry = "https://pypi.org/simple" }, extra = ["crypto"], marker = "python_full_version >= '3.9'" }, + { name = "pyjwt", extra = ["crypto"] }, { name = "pynacl" }, { name = "requests" }, { name = "typing-extensions" }, - { name = "urllib3", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "urllib3", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "urllib3" }, ] sdist = { url = "https://files.pythonhosted.org/packages/16/ce/aa91d30040d9552c274e7ea8bd10a977600d508d579a4bb262b95eccf961/pygithub-2.5.0.tar.gz", hash = "sha256:e1613ac508a9be710920d26eb18b1905ebd9926aa49398e88151c1b526aad3cf", size = 3552804 } wheels = [ @@ -1371,33 +1028,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 }, ] -[[package]] -name = "pyjwt" -version = 
"2.9.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/fb/68/ce067f09fca4abeca8771fe667d89cc347d1e99da3e093112ac329c6020e/pyjwt-2.9.0.tar.gz", hash = "sha256:7e1e5b56cc735432a7369cbfa0efe50fa113ebecdc04ae6922deba8b84582d0c", size = 78825 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/79/84/0fdf9b18ba31d69877bd39c9cd6052b47f3761e9910c15de788e519f079f/PyJWT-2.9.0-py3-none-any.whl", hash = "sha256:3b02fb0f44517787776cf48f2ae25d8e14f300e6d7545a4315cee571a415e850", size = 22344 }, -] - -[package.optional-dependencies] -crypto = [ - { name = "cryptography", marker = "python_full_version < '3.9'" }, -] - [[package]] name = "pyjwt" version = "2.10.1" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/e7/46/bd74733ff231675599650d3e47f361794b22ef3e3770998dda30d3b63726/pyjwt-2.10.1.tar.gz", hash = "sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953", size = 87785 } wheels = [ { url = "https://files.pythonhosted.org/packages/61/ad/689f02752eeec26aed679477e80e632ef1b682313be70793d798c1d5fc8f/PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb", size = 22997 }, @@ -1405,7 +1039,7 @@ wheels = [ [package.optional-dependencies] crypto = [ - { name = "cryptography", marker = "python_full_version >= '3.9'" }, + { name = "cryptography" }, ] [[package]] @@ -1508,13 +1142,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fe/0f/25911a9f080464c59fab9027482f822b86bf0608957a5fcc6eaac85aa515/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", size = 751597 
}, { url = "https://files.pythonhosted.org/packages/14/0d/e2c3b43bbce3cf6bd97c840b46088a3031085179e596d4929729d8d68270/PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", size = 140527 }, { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446 }, - { url = "https://files.pythonhosted.org/packages/74/d9/323a59d506f12f498c2097488d80d16f4cf965cee1791eab58b56b19f47a/PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a", size = 183218 }, - { url = "https://files.pythonhosted.org/packages/74/cc/20c34d00f04d785f2028737e2e2a8254e1425102e730fee1d6396f832577/PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5", size = 728067 }, - { url = "https://files.pythonhosted.org/packages/20/52/551c69ca1501d21c0de51ddafa8c23a0191ef296ff098e98358f69080577/PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d", size = 757812 }, - { url = "https://files.pythonhosted.org/packages/fd/7f/2c3697bba5d4aa5cc2afe81826d73dfae5f049458e44732c7a0938baa673/PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083", size = 746531 }, - { url = "https://files.pythonhosted.org/packages/8c/ab/6226d3df99900e580091bb44258fde77a8433511a86883bd4681ea19a858/PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706", size = 800820 }, - { url = 
"https://files.pythonhosted.org/packages/a0/99/a9eb0f3e710c06c5d922026f6736e920d431812ace24aae38228d0d64b04/PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a", size = 145514 }, - { url = "https://files.pythonhosted.org/packages/75/8a/ee831ad5fafa4431099aa4e078d4c8efd43cd5e48fbc774641d233b683a9/PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff", size = 162702 }, { url = "https://files.pythonhosted.org/packages/65/d8/b7a1db13636d7fb7d4ff431593c510c8b8fca920ade06ca8ef20015493c5/PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d", size = 184777 }, { url = "https://files.pythonhosted.org/packages/0a/02/6ec546cd45143fdf9840b2c6be8d875116a64076218b61d68e12548e5839/PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f", size = 172318 }, { url = "https://files.pythonhosted.org/packages/0e/9a/8cc68be846c972bda34f6c2a93abb644fb2476f4dcc924d52175786932c9/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290", size = 720891 }, @@ -1534,8 +1161,7 @@ dependencies = [ { name = "certifi" }, { name = "charset-normalizer" }, { name = "idna" }, - { name = "urllib3", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "urllib3", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "urllib3" }, ] sdist = { url = "https://files.pythonhosted.org/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218 } wheels = [ @@ -1567,28 +1193,10 
@@ wheels = [ { url = "https://files.pythonhosted.org/packages/b2/94/0498cdb7316ed67a1928300dd87d659c933479f44dec51b4f62bfd1f8028/ruff-0.9.1-py3-none-win_arm64.whl", hash = "sha256:1cd76c7f9c679e6e8f2af8f778367dca82b95009bc7b1a85a47f1521ae524fa7", size = 9145708 }, ] -[[package]] -name = "setuptools" -version = "75.3.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/ed/22/a438e0caa4576f8c383fa4d35f1cc01655a46c75be358960d815bfbb12bd/setuptools-75.3.0.tar.gz", hash = "sha256:fba5dd4d766e97be1b1681d98712680ae8f2f26d7881245f2ce9e40714f1a686", size = 1351577 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/90/12/282ee9bce8b58130cb762fbc9beabd531549952cac11fc56add11dcb7ea0/setuptools-75.3.0-py3-none-any.whl", hash = "sha256:f2504966861356aa38616760c0f66568e535562374995367b4e69c7143cf6bcd", size = 1251070 }, -] - [[package]] name = "setuptools" version = "75.8.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/92/ec/089608b791d210aec4e7f97488e67ab0d33add3efccb83a056cbafe3a2a6/setuptools-75.8.0.tar.gz", hash = "sha256:c5afc8f407c626b8313a86e10311dd3f661c6cd9c09d4bf8c15c0e11f9f2b0e6", size = 1343222 } wheels = [ { url = "https://files.pythonhosted.org/packages/69/8a/b9dc7678803429e4a3bc9ba462fa3dd9066824d3c607490235c6a796be5a/setuptools-75.8.0-py3-none-any.whl", hash = "sha256:e3982f444617239225d675215d51f6ba05f845d4eec313da4418fdbb56fb27e3", size = 1228782 }, @@ -1621,63 +1229,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/c2/fe97d779f3ef3b15f05c94a2f1e3d21732574ed441687474db9d342a7315/soupsieve-2.6-py3-none-any.whl", hash = 
"sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9", size = 36186 }, ] -[[package]] -name = "sphinx" -version = "7.1.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "alabaster", version = "0.7.13", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "babel", marker = "python_full_version < '3.9'" }, - { name = "colorama", marker = "python_full_version < '3.9' and sys_platform == 'win32'" }, - { name = "docutils", version = "0.20.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "imagesize", marker = "python_full_version < '3.9'" }, - { name = "importlib-metadata", marker = "python_full_version < '3.9'" }, - { name = "jinja2", marker = "python_full_version < '3.9'" }, - { name = "packaging", marker = "python_full_version < '3.9'" }, - { name = "pygments", marker = "python_full_version < '3.9'" }, - { name = "requests", marker = "python_full_version < '3.9'" }, - { name = "snowballstemmer", marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-applehelp", version = "1.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-devhelp", version = "1.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-htmlhelp", version = "2.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-jsmath", marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-qthelp", version = "1.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-serializinghtml", version = "1.1.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, 
-] -sdist = { url = "https://files.pythonhosted.org/packages/dc/01/688bdf9282241dca09fe6e3a1110eda399fa9b10d0672db609e37c2e7a39/sphinx-7.1.2.tar.gz", hash = "sha256:780f4d32f1d7d1126576e0e5ecc19dc32ab76cd24e950228dcf7b1f6d3d9e22f", size = 6828258 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/48/17/325cf6a257d84751a48ae90752b3d8fe0be8f9535b6253add61c49d0d9bc/sphinx-7.1.2-py3-none-any.whl", hash = "sha256:d170a81825b2fcacb6dfd5a0d7f578a053e45d3f2b153fecc948c37344eb4cbe", size = 3169543 }, -] - [[package]] name = "sphinx" version = "7.4.7" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "alabaster", version = "0.7.16", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "babel", marker = "python_full_version == '3.9.*'" }, - { name = "colorama", marker = "python_full_version == '3.9.*' and sys_platform == 'win32'" }, - { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "imagesize", marker = "python_full_version == '3.9.*'" }, - { name = "importlib-metadata", marker = "python_full_version == '3.9.*'" }, - { name = "jinja2", marker = "python_full_version == '3.9.*'" }, - { name = "packaging", marker = "python_full_version == '3.9.*'" }, - { name = "pygments", marker = "python_full_version == '3.9.*'" }, - { name = "requests", marker = "python_full_version == '3.9.*'" }, - { name = "snowballstemmer", marker = "python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-applehelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-devhelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-htmlhelp", 
version = "2.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-jsmath", marker = "python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-qthelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-serializinghtml", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "tomli", marker = "python_full_version == '3.9.*'" }, + { name = "alabaster", version = "0.7.16", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "babel", marker = "python_full_version < '3.10'" }, + { name = "colorama", marker = "python_full_version < '3.10' and sys_platform == 'win32'" }, + { name = "docutils", marker = "python_full_version < '3.10'" }, + { name = "imagesize", marker = "python_full_version < '3.10'" }, + { name = "importlib-metadata", marker = "python_full_version < '3.10'" }, + { name = "jinja2", marker = "python_full_version < '3.10'" }, + { name = "packaging", marker = "python_full_version < '3.10'" }, + { name = "pygments", marker = "python_full_version < '3.10'" }, + { name = "requests", marker = "python_full_version < '3.10'" }, + { name = "snowballstemmer", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-applehelp", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-devhelp", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-htmlhelp", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-jsmath", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-qthelp", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-serializinghtml", marker = "python_full_version < '3.10'" }, + { name = "tomli", marker = "python_full_version < '3.10'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/5b/be/50e50cb4f2eff47df05673d361095cafd95521d2a22521b920c67a372dcb/sphinx-7.4.7.tar.gz", hash = "sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe", size = 8067911 } wheels = [ @@ -1697,19 +1274,19 @@ dependencies = [ { name = "alabaster", version = "1.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "babel", marker = "python_full_version >= '3.10'" }, { name = "colorama", marker = "python_full_version >= '3.10' and sys_platform == 'win32'" }, - { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "docutils", marker = "python_full_version >= '3.10'" }, { name = "imagesize", marker = "python_full_version >= '3.10'" }, { name = "jinja2", marker = "python_full_version >= '3.10'" }, { name = "packaging", marker = "python_full_version >= '3.10'" }, { name = "pygments", marker = "python_full_version >= '3.10'" }, { name = "requests", marker = "python_full_version >= '3.10'" }, { name = "snowballstemmer", marker = "python_full_version >= '3.10'" }, - { name = "sphinxcontrib-applehelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "sphinxcontrib-devhelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "sphinxcontrib-htmlhelp", version = "2.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-applehelp", marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-devhelp", marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-htmlhelp", marker = "python_full_version >= '3.10'" }, { name = "sphinxcontrib-jsmath", marker = "python_full_version >= '3.10'" }, - { name = "sphinxcontrib-qthelp", version = 
"2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "sphinxcontrib-serializinghtml", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-qthelp", marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-serializinghtml", marker = "python_full_version >= '3.10'" }, { name = "tomli", marker = "python_full_version == '3.10.*'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/be0b61178fe2cdcb67e2a92fc9ebb488e3c51c4f74a36a7824c0adf23425/sphinx-8.1.3.tar.gz", hash = "sha256:43c1911eecb0d3e161ad78611bc905d1ad0e523e4ddc202a58a821773dc4c927", size = 8184611 } @@ -1722,97 +1299,40 @@ name = "sphinx-autoapi" version = "3.4.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "astroid", version = "3.2.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "astroid", version = "3.3.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "astroid" }, { name = "jinja2" }, { name = "pyyaml" }, - { name = "sphinx", version = "7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "stdlib-list", version = "0.10.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "stdlib-list", version = "0.11.0", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version == '3.9.*'" }, + { name = "stdlib-list", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/4a/eb/cc243583bb1d518ca3b10998c203d919a8ed90affd4831f2b61ad09043d2/sphinx_autoapi-3.4.0.tar.gz", hash = "sha256:e6d5371f9411bbb9fca358c00a9e57aef3ac94cbfc5df4bab285946462f69e0c", size = 29292 } wheels = [ { url = "https://files.pythonhosted.org/packages/de/d6/f2acdc2567337fd5f5dc091a4e58d8a0fb14927b9779fc1e5ecee96d9824/sphinx_autoapi-3.4.0-py3-none-any.whl", hash = "sha256:4027fef2875a22c5f2a57107c71641d82f6166bf55beb407a47aaf3ef14e7b92", size = 34095 }, ] -[[package]] -name = "sphinxcontrib-applehelp" -version = "1.0.4" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/32/df/45e827f4d7e7fcc84e853bcef1d836effd762d63ccb86f43ede4e98b478c/sphinxcontrib-applehelp-1.0.4.tar.gz", hash = "sha256:828f867945bbe39817c210a1abfd1bc4895c8b73fcaade56d45357a348a07d7e", size = 24766 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/06/c1/5e2cafbd03105ce50d8500f9b4e8a6e8d02e22d0475b574c3b3e9451a15f/sphinxcontrib_applehelp-1.0.4-py3-none-any.whl", hash = "sha256:29d341f67fb0f6f586b23ad80e072c8e6ad0b48417db2bde114a4c9746feb228", size = 120601 }, -] - [[package]] name = "sphinxcontrib-applehelp" version = "2.0.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/ba/6e/b837e84a1a704953c62ef8776d45c3e8d759876b4a84fe14eba2859106fe/sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1", size = 20053 } wheels = [ { url = 
"https://files.pythonhosted.org/packages/5d/85/9ebeae2f76e9e77b952f4b274c27238156eae7979c5421fba91a28f4970d/sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5", size = 119300 }, ] -[[package]] -name = "sphinxcontrib-devhelp" -version = "1.0.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/98/33/dc28393f16385f722c893cb55539c641c9aaec8d1bc1c15b69ce0ac2dbb3/sphinxcontrib-devhelp-1.0.2.tar.gz", hash = "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4", size = 17398 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/09/5de5ed43a521387f18bdf5f5af31d099605c992fd25372b2b9b825ce48ee/sphinxcontrib_devhelp-1.0.2-py2.py3-none-any.whl", hash = "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e", size = 84690 }, -] - [[package]] name = "sphinxcontrib-devhelp" version = "2.0.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/f6/d2/5beee64d3e4e747f316bae86b55943f51e82bb86ecd325883ef65741e7da/sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad", size = 12967 } wheels = [ { url = "https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2", size = 82530 }, ] -[[package]] -name = "sphinxcontrib-htmlhelp" -version = "2.0.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = 
"https://files.pythonhosted.org/packages/b3/47/64cff68ea3aa450c373301e5bebfbb9fce0a3e70aca245fcadd4af06cd75/sphinxcontrib-htmlhelp-2.0.1.tar.gz", hash = "sha256:0cbdd302815330058422b98a113195c9249825d681e18f11e8b1f78a2f11efff", size = 27967 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6e/ee/a1f5e39046cbb5f8bc8fba87d1ddf1c6643fbc9194e58d26e606de4b9074/sphinxcontrib_htmlhelp-2.0.1-py3-none-any.whl", hash = "sha256:c38cb46dccf316c79de6e5515e1770414b797162b23cd3d06e67020e1d2a6903", size = 99833 }, -] - [[package]] name = "sphinxcontrib-htmlhelp" version = "2.1.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/43/93/983afd9aa001e5201eab16b5a444ed5b9b0a7a010541e0ddfbbfd0b2470c/sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9", size = 22617 } wheels = [ { url = "https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8", size = 98705 }, @@ -1827,55 +1347,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c2/42/4c8646762ee83602e3fb3fbe774c2fac12f317deb0b5dbeeedd2d3ba4b77/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", size = 5071 }, ] -[[package]] -name = "sphinxcontrib-qthelp" -version = "1.0.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/b1/8e/c4846e59f38a5f2b4a0e3b27af38f2fcf904d4bfd82095bf92de0b114ebd/sphinxcontrib-qthelp-1.0.3.tar.gz", hash = 
"sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72", size = 21658 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2b/14/05f9206cf4e9cfca1afb5fd224c7cd434dcc3a433d6d9e4e0264d29c6cdb/sphinxcontrib_qthelp-1.0.3-py2.py3-none-any.whl", hash = "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6", size = 90609 }, -] - [[package]] name = "sphinxcontrib-qthelp" version = "2.0.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/68/bc/9104308fc285eb3e0b31b67688235db556cd5b0ef31d96f30e45f2e51cae/sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab", size = 17165 } wheels = [ { url = "https://files.pythonhosted.org/packages/27/83/859ecdd180cacc13b1f7e857abf8582a64552ea7a061057a6c716e790fce/sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb", size = 88743 }, ] -[[package]] -name = "sphinxcontrib-serializinghtml" -version = "1.1.5" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/b5/72/835d6fadb9e5d02304cf39b18f93d227cd93abd3c41ebf58e6853eeb1455/sphinxcontrib-serializinghtml-1.1.5.tar.gz", hash = "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952", size = 21019 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c6/77/5464ec50dd0f1c1037e3c93249b040c8fc8078fdda97530eeb02424b6eea/sphinxcontrib_serializinghtml-1.1.5-py2.py3-none-any.whl", hash = "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd", size = 94021 }, -] - [[package]] name = "sphinxcontrib-serializinghtml" version = "2.0.0" 
source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/3b/44/6716b257b0aa6bfd51a1b31665d1c205fb12cb5ad56de752dfa15657de2f/sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d", size = 16080 } wheels = [ { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072 }, @@ -1895,25 +1379,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521 }, ] -[[package]] -name = "stdlib-list" -version = "0.10.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/39/bb/1cdbc326a5ab0026602e0489cbf02357e78140253c4b57cd866d380eb355/stdlib_list-0.10.0.tar.gz", hash = "sha256:6519c50d645513ed287657bfe856d527f277331540691ddeaf77b25459964a14", size = 59447 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/13/d9/9085375f0d23a4896b307bf14dcc61b49ec8cc67cb33e06cf95bf3af3966/stdlib_list-0.10.0-py3-none-any.whl", hash = "sha256:b3a911bc441d03e0332dd1a9e7d0870ba3bb0a542a74d7524f54fb431256e214", size = 79814 }, -] - [[package]] name = "stdlib-list" version = "0.11.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version == '3.9.*'", -] sdist = { url = 
"https://files.pythonhosted.org/packages/5d/04/6b37a71e92ddca16b190b7df62494ac4779d58ced4787f73584eb32c8f03/stdlib_list-0.11.0.tar.gz", hash = "sha256:b74a7b643a77a12637e907f3f62f0ab9f67300bce4014f6b2d3c8b4c8fd63c66", size = 60335 } wheels = [ { url = "https://files.pythonhosted.org/packages/16/fe/e07300c027a868d32d8ed7a425503401e91a03ff90e7ca525c115c634ffb/stdlib_list-0.11.0-py3-none-any.whl", hash = "sha256:8bf8decfffaaf273d4cfeb5bd852b910a00dec1037dcf163576803622bccf597", size = 83617 }, @@ -1994,28 +1463,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a6/ab/7e5f53c3b9d14972843a647d8d7a853969a58aecc7559cb3267302c94774/tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd", size = 346586 }, ] -[[package]] -name = "urllib3" -version = "2.2.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/ed/63/22ba4ebfe7430b76388e7cd448d5478814d3032121827c12a2cc287e2260/urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9", size = 300677 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ce/d9/5f4c13cecde62396b0d3fe530a50ccea91e7dfc1ccf0e09c228841bb5ba8/urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac", size = 126338 }, -] - [[package]] name = "urllib3" version = "2.3.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/aa/63/e53da845320b757bf29ef6a9062f5c669fe997973f966045cb019c3f4b66/urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d", size = 307268 } wheels = [ { url = 
"https://files.pythonhosted.org/packages/c8/19/4ec628951a74043532ca2cf5d97b7b14863931476d117c471e8e2b1eb39f/urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df", size = 128369 }, @@ -2091,17 +1542,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a7/b1/0bb11e29aa5139d90b770ebbfa167267b1fc548d2302c30c8f7572851738/wrapt-1.17.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c82b8785d98cdd9fed4cac84d765d234ed3251bd6afe34cb7ac523cb93e8b4f", size = 106377 }, { url = "https://files.pythonhosted.org/packages/6a/e1/0122853035b40b3f333bbb25f1939fc1045e21dd518f7f0922b60c156f7c/wrapt-1.17.2-cp313-cp313t-win32.whl", hash = "sha256:13e6afb7fe71fe7485a4550a8844cc9ffbe263c0f1a1eea569bc7091d4898555", size = 37986 }, { url = "https://files.pythonhosted.org/packages/09/5e/1655cf481e079c1f22d0cabdd4e51733679932718dc23bf2db175f329b76/wrapt-1.17.2-cp313-cp313t-win_amd64.whl", hash = "sha256:eaf675418ed6b3b31c7a989fd007fa7c3be66ce14e5c3b27336383604c9da85c", size = 40750 }, - { url = "https://files.pythonhosted.org/packages/0c/66/95b9e90e6e1274999b183c9c3f984996d870e933ca9560115bd1cd1d6f77/wrapt-1.17.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5c803c401ea1c1c18de70a06a6f79fcc9c5acfc79133e9869e730ad7f8ad8ef9", size = 53234 }, - { url = "https://files.pythonhosted.org/packages/a4/b6/6eced5e2db5924bf6d9223d2bb96b62e00395aae77058e6a9e11bf16b3bd/wrapt-1.17.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f917c1180fdb8623c2b75a99192f4025e412597c50b2ac870f156de8fb101119", size = 38462 }, - { url = "https://files.pythonhosted.org/packages/5d/a4/c8472fe2568978b5532df84273c53ddf713f689d408a4335717ab89547e0/wrapt-1.17.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ecc840861360ba9d176d413a5489b9a0aff6d6303d7e733e2c4623cfa26904a6", size = 38730 }, - { url = 
"https://files.pythonhosted.org/packages/3c/70/1d259c6b1ad164eb23ff70e3e452dd1950f96e6473f72b7207891d0fd1f0/wrapt-1.17.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb87745b2e6dc56361bfde481d5a378dc314b252a98d7dd19a651a3fa58f24a9", size = 86225 }, - { url = "https://files.pythonhosted.org/packages/a9/68/6b83367e1afb8de91cbea4ef8e85b58acdf62f034f05d78c7b82afaa23d8/wrapt-1.17.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58455b79ec2661c3600e65c0a716955adc2410f7383755d537584b0de41b1d8a", size = 78055 }, - { url = "https://files.pythonhosted.org/packages/0d/21/09573d2443916705c57fdab85d508f592c0a58d57becc53e15755d67fba2/wrapt-1.17.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4e42a40a5e164cbfdb7b386c966a588b1047558a990981ace551ed7e12ca9c2", size = 85592 }, - { url = "https://files.pythonhosted.org/packages/45/ce/700e17a852dd5dec894e241c72973ea82363486bcc1fb05d47b4fbd1d683/wrapt-1.17.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:91bd7d1773e64019f9288b7a5101f3ae50d3d8e6b1de7edee9c2ccc1d32f0c0a", size = 83906 }, - { url = "https://files.pythonhosted.org/packages/37/14/bd210faf0a66faeb8529d42b6b45a25d6aa6ce25ddfc19168e4161aed227/wrapt-1.17.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:bb90fb8bda722a1b9d48ac1e6c38f923ea757b3baf8ebd0c82e09c5c1a0e7a04", size = 76763 }, - { url = "https://files.pythonhosted.org/packages/34/0c/85af70d291f44659c422416f0272046109e785bf6db8c081cfeeae5715c5/wrapt-1.17.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:08e7ce672e35efa54c5024936e559469436f8b8096253404faeb54d2a878416f", size = 83573 }, - { url = "https://files.pythonhosted.org/packages/f8/1e/b215068e824878f69ea945804fa26c176f7c2735a3ad5367d78930bd076a/wrapt-1.17.2-cp38-cp38-win32.whl", hash = "sha256:410a92fefd2e0e10d26210e1dfb4a876ddaf8439ef60d6434f21ef8d87efc5b7", size = 36408 }, - { url = 
"https://files.pythonhosted.org/packages/52/27/3dd9ad5f1097b33c95d05929e409cc86d7c765cb5437b86694dc8f8e9af0/wrapt-1.17.2-cp38-cp38-win_amd64.whl", hash = "sha256:95c658736ec15602da0ed73f312d410117723914a5c91a14ee4cdd72f1d790b3", size = 38737 }, { url = "https://files.pythonhosted.org/packages/8a/f4/6ed2b8f6f1c832933283974839b88ec7c983fd12905e01e97889dadf7559/wrapt-1.17.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:99039fa9e6306880572915728d7f6c24a86ec57b0a83f6b2491e1d8ab0235b9a", size = 53308 }, { url = "https://files.pythonhosted.org/packages/a2/a9/712a53f8f4f4545768ac532619f6e56d5d0364a87b2212531685e89aeef8/wrapt-1.17.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2696993ee1eebd20b8e4ee4356483c4cb696066ddc24bd70bcbb80fa56ff9061", size = 38489 }, { url = "https://files.pythonhosted.org/packages/fa/9b/e172c8f28a489a2888df18f953e2f6cb8d33b1a2e78c9dfc52d8bf6a5ead/wrapt-1.17.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:612dff5db80beef9e649c6d803a8d50c409082f1fedc9dbcdfde2983b2025b82", size = 38776 }, @@ -2116,25 +1556,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2d/82/f56956041adef78f849db6b289b282e72b55ab8045a75abad81898c28d19/wrapt-1.17.2-py3-none-any.whl", hash = "sha256:b18f2d1533a71f069c7f82d524a52599053d4c7166e9dd374ae2136b7f40f7c8", size = 23594 }, ] -[[package]] -name = "zipp" -version = "3.20.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/54/bf/5c0000c44ebc80123ecbdddba1f5dcd94a5ada602a9c225d84b5aaa55e86/zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29", size = 24199 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/62/8b/5ba542fa83c90e09eac972fc9baca7a88e7e7ca4b221a89251954019308b/zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350", size = 9200 }, -] - [[package]] name 
= "zipp" version = "3.21.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/3f/50/bad581df71744867e9468ebd0bcd6505de3b275e06f202c2cb016e3ff56f/zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4", size = 24545 } wheels = [ { url = "https://files.pythonhosted.org/packages/b7/1a/7e4798e9339adc931158c9d69ecc34f5e6791489d469f5e50ec15e35f458/zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931", size = 9630 }, From b194a8772e58ccefc697e11671113127a8038716 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 12 Mar 2025 14:25:32 -0400 Subject: [PATCH 109/248] feat/improve ruff test coverage (#1055) * Run python tests on all currently supported python versions * Update ruff checks to select all * Ruff auto fix * Applying ruff suggestions * noqa rules updates per ruff checks * Working through more ruff suggestions * Working through more ruff suggestions * update timestamps on tests * More ruff updates * More ruff updates * Instead of importing udf static functions as variables, import * More ruff formatting suggestions * more ruff formatting suggestions * More ruff formatting * More ruff formatting * Cut off lint errors for this PR * Working through more ruff checks and disabling a bunch for now * Address CI difference from local ruff * UDWF isn't a proper abstract base class right now since users can opt in to all methods * Update pre-commit to match the version of ruff used in CI * To enable testing in python 3.9 we need numpy. Also going to the current minimal supported version * Update min requried version of python to 3.9 in pyproject.toml. The other changes will come in #1043 that is soon to be merged. 
* Suppress UP035 * ruff format --- .github/workflows/test.yaml | 2 + .pre-commit-config.yaml | 2 +- benchmarks/tpch/tpch.py | 14 +- dev/release/check-rat-report.py | 2 +- dev/release/generate-changelog.py | 10 +- docs/source/conf.py | 2 +- examples/python-udwf.py | 2 +- examples/tpch/_tests.py | 15 +- pyproject.toml | 76 +++++- python/datafusion/__init__.py | 50 ++-- python/datafusion/common.py | 14 +- python/datafusion/context.py | 4 +- python/datafusion/dataframe.py | 15 +- python/datafusion/expr.py | 94 +++---- python/datafusion/functions.py | 46 ++-- python/datafusion/input/__init__.py | 2 +- python/datafusion/input/base.py | 6 +- python/datafusion/input/location.py | 40 +-- python/datafusion/io.py | 20 +- python/datafusion/object_store.py | 2 +- python/datafusion/plan.py | 8 +- python/datafusion/record_batch.py | 8 +- python/datafusion/substrait.py | 21 +- python/datafusion/udf.py | 236 +++++++++-------- python/tests/generic.py | 19 +- python/tests/test_aggregation.py | 16 +- python/tests/test_catalog.py | 9 +- python/tests/test_context.py | 53 ++-- python/tests/test_dataframe.py | 38 ++- python/tests/test_expr.py | 11 +- python/tests/test_functions.py | 358 ++++++++++++++------------ python/tests/test_imports.py | 7 +- python/tests/test_input.py | 12 +- python/tests/test_io.py | 13 +- python/tests/test_sql.py | 35 +-- python/tests/test_store.py | 13 +- python/tests/test_substrait.py | 2 +- python/tests/test_udaf.py | 10 +- python/tests/test_udwf.py | 2 +- python/tests/test_wrapper_coverage.py | 7 +- 40 files changed, 697 insertions(+), 599 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index c1d9ac838..da3582766 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -33,9 +33,11 @@ jobs: fail-fast: false matrix: python-version: + - "3.9" - "3.10" - "3.11" - "3.12" + - "3.13" toolchain: - "stable" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b548ff18f..abcfcf321 100644 --- 
a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: - id: actionlint-docker - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.3.0 + rev: v0.9.10 hooks: # Run the linter. - id: ruff diff --git a/benchmarks/tpch/tpch.py b/benchmarks/tpch/tpch.py index fb86b12b6..bfb9ac398 100644 --- a/benchmarks/tpch/tpch.py +++ b/benchmarks/tpch/tpch.py @@ -59,13 +59,13 @@ def bench(data_path, query_path): end = time.time() time_millis = (end - start) * 1000 total_time_millis += time_millis - print("setup,{}".format(round(time_millis, 1))) - results.write("setup,{}\n".format(round(time_millis, 1))) + print(f"setup,{round(time_millis, 1)}") + results.write(f"setup,{round(time_millis, 1)}\n") results.flush() # run queries for query in range(1, 23): - with open("{}/q{}.sql".format(query_path, query)) as f: + with open(f"{query_path}/q{query}.sql") as f: text = f.read() tmp = text.split(";") queries = [] @@ -83,14 +83,14 @@ def bench(data_path, query_path): end = time.time() time_millis = (end - start) * 1000 total_time_millis += time_millis - print("q{},{}".format(query, round(time_millis, 1))) - results.write("q{},{}\n".format(query, round(time_millis, 1))) + print(f"q{query},{round(time_millis, 1)}") + results.write(f"q{query},{round(time_millis, 1)}\n") results.flush() except Exception as e: print("query", query, "failed", e) - print("total,{}".format(round(total_time_millis, 1))) - results.write("total,{}\n".format(round(total_time_millis, 1))) + print(f"total,{round(total_time_millis, 1)}") + results.write(f"total,{round(total_time_millis, 1)}\n") if __name__ == "__main__": diff --git a/dev/release/check-rat-report.py b/dev/release/check-rat-report.py index d3dd7c5dd..0c9f4c326 100644 --- a/dev/release/check-rat-report.py +++ b/dev/release/check-rat-report.py @@ -29,7 +29,7 @@ exclude_globs_filename = sys.argv[1] xml_filename = sys.argv[2] -globs = [line.strip() for line in open(exclude_globs_filename, "r")] +globs = 
[line.strip() for line in open(exclude_globs_filename)] tree = ET.parse(xml_filename) root = tree.getroot() diff --git a/dev/release/generate-changelog.py b/dev/release/generate-changelog.py index 2564eea86..e30e2def2 100755 --- a/dev/release/generate-changelog.py +++ b/dev/release/generate-changelog.py @@ -26,15 +26,11 @@ def print_pulls(repo_name, title, pulls): if len(pulls) > 0: - print("**{}:**".format(title)) + print(f"**{title}:**") print() for pull, commit in pulls: - url = "https://github.com/{}/pull/{}".format(repo_name, pull.number) - print( - "- {} [#{}]({}) ({})".format( - pull.title, pull.number, url, commit.author.login - ) - ) + url = f"https://github.com/{repo_name}/pull/{pull.number}" + print(f"- {pull.title} [#{pull.number}]({url}) ({commit.author.login})") print() diff --git a/docs/source/conf.py b/docs/source/conf.py index 2e5a41339..c82a189e0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -73,7 +73,7 @@ autoapi_python_class_content = "both" -def autoapi_skip_member_fn(app, what, name, obj, skip, options): +def autoapi_skip_member_fn(app, what, name, obj, skip, options): # noqa: ARG001 skip_contents = [ # Re-exports ("class", "datafusion.DataFrame"), diff --git a/examples/python-udwf.py b/examples/python-udwf.py index 7d39dc1b8..98d118bf2 100644 --- a/examples/python-udwf.py +++ b/examples/python-udwf.py @@ -59,7 +59,7 @@ def __init__(self, alpha: float) -> None: def supports_bounded_execution(self) -> bool: return True - def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: + def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: # noqa: ARG002 # Override the default range of current row since uses_window_frame is False # So for the purpose of this test we just smooth from the previous row to # current. 
diff --git a/examples/tpch/_tests.py b/examples/tpch/_tests.py index c4d872085..2be4dfabd 100644 --- a/examples/tpch/_tests.py +++ b/examples/tpch/_tests.py @@ -27,28 +27,25 @@ def df_selection(col_name, col_type): if col_type == pa.float64() or isinstance(col_type, pa.Decimal128Type): return F.round(col(col_name), lit(2)).alias(col_name) - elif col_type == pa.string() or col_type == pa.string_view(): + if col_type == pa.string() or col_type == pa.string_view(): return F.trim(col(col_name)).alias(col_name) - else: - return col(col_name) + return col(col_name) def load_schema(col_name, col_type): if col_type == pa.int64() or col_type == pa.int32(): return col_name, pa.string() - elif isinstance(col_type, pa.Decimal128Type): + if isinstance(col_type, pa.Decimal128Type): return col_name, pa.float64() - else: - return col_name, col_type + return col_name, col_type def expected_selection(col_name, col_type): if col_type == pa.int64() or col_type == pa.int32(): return F.trim(col(col_name)).cast(col_type).alias(col_name) - elif col_type == pa.string() or col_type == pa.string_view(): + if col_type == pa.string() or col_type == pa.string_view(): return F.trim(col(col_name)).alias(col_name) - else: - return col(col_name) + return col(col_name) def selections_and_schema(original_schema): diff --git a/pyproject.toml b/pyproject.toml index 1c2733677..060e3b80a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,57 @@ features = ["substrait"] # Enable docstring linting using the google style guide [tool.ruff.lint] -select = ["E4", "E7", "E9", "F", "FA", "D", "W", "I"] +select = ["ALL" ] +ignore = [ + "A001", # Allow using words like min as variable names + "A002", # Allow using words like filter as variable names + "ANN401", # Allow Any for wrapper classes + "COM812", # Recommended to ignore these rules when using with ruff-format + "FIX002", # Allow TODO lines - consider removing at some point + "FBT001", # Allow boolean positional args + "FBT002", # Allow boolean 
positional args + "ISC001", # Recommended to ignore these rules when using with ruff-format + "SLF001", # Allow accessing private members + "TD002", + "TD003", # Allow TODO lines + "UP007", # Disallowing Union is pedantic + # TODO: Enable all of the following, but this PR is getting too large already + "PT001", + "ANN204", + "B008", + "EM101", + "PLR0913", + "PLR1714", + "ANN201", + "C400", + "TRY003", + "B904", + "UP006", + "RUF012", + "FBT003", + "C416", + "SIM102", + "PGH003", + "PLR2004", + "PERF401", + "PD901", + "EM102", + "ERA001", + "SIM108", + "ICN001", + "ANN001", + "ANN202", + "PTH", + "N812", + "INP001", + "DTZ007", + "PLW2901", + "RET503", + "RUF015", + "A005", + "TC001", + "UP035", +] [tool.ruff.lint.pydocstyle] convention = "google" @@ -75,16 +125,30 @@ max-doc-length = 88 # Disable docstring checking for these directories [tool.ruff.lint.per-file-ignores] -"python/tests/*" = ["D"] -"examples/*" = ["D", "W505"] -"dev/*" = ["D"] -"benchmarks/*" = ["D", "F"] +"python/tests/*" = [ + "ANN", + "ARG", + "BLE001", + "D", + "S101", + "SLF", + "PD", + "PLR2004", + "PT011", + "RUF015", + "S608", + "PLR0913", + "PT004", +] +"examples/*" = ["D", "W505", "E501", "T201", "S101"] +"dev/*" = ["D", "E", "T", "S", "PLR", "C", "SIM", "UP", "EXE", "N817"] +"benchmarks/*" = ["D", "F", "T", "BLE", "FURB", "PLR", "E", "TD", "TRY", "S", "SIM", "EXE", "UP"] "docs/*" = ["D"] [dependency-groups] dev = [ "maturin>=1.8.1", - "numpy>1.24.4 ; python_full_version >= '3.10'", + "numpy>1.25.0", "pytest>=7.4.4", "ruff>=0.9.1", "toml>=0.10.2", diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index f11ce54a6..286e5dc31 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -48,44 +48,47 @@ from .io import read_avro, read_csv, read_json, read_parquet from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream -from .udf import Accumulator, AggregateUDF, ScalarUDF, WindowUDF +from .udf import 
Accumulator, AggregateUDF, ScalarUDF, WindowUDF, udaf, udf, udwf __version__ = importlib_metadata.version(__name__) __all__ = [ "Accumulator", + "AggregateUDF", + "Catalog", "Config", - "DataFrame", - "SessionContext", - "SessionConfig", - "SQLOptions", - "RuntimeEnvBuilder", - "Expr", - "ScalarUDF", - "WindowFrame", - "column", - "col", - "literal", - "lit", "DFSchema", - "Catalog", + "DataFrame", "Database", - "Table", - "AggregateUDF", - "WindowUDF", - "LogicalPlan", "ExecutionPlan", + "Expr", + "LogicalPlan", "RecordBatch", "RecordBatchStream", + "RuntimeEnvBuilder", + "SQLOptions", + "ScalarUDF", + "SessionConfig", + "SessionContext", + "Table", + "WindowFrame", + "WindowUDF", + "col", + "column", "common", "expr", "functions", + "lit", + "literal", "object_store", - "substrait", - "read_parquet", "read_avro", "read_csv", "read_json", + "read_parquet", + "substrait", + "udaf", + "udf", + "udwf", ] @@ -120,10 +123,3 @@ def str_lit(value): def lit(value): """Create a literal expression.""" return Expr.literal(value) - - -udf = ScalarUDF.udf - -udaf = AggregateUDF.udaf - -udwf = WindowUDF.udwf diff --git a/python/datafusion/common.py b/python/datafusion/common.py index a2298c634..e762a993b 100644 --- a/python/datafusion/common.py +++ b/python/datafusion/common.py @@ -20,7 +20,7 @@ from ._internal import common as common_internal -# TODO these should all have proper wrapper classes +# TODO: these should all have proper wrapper classes DFSchema = common_internal.DFSchema DataType = common_internal.DataType @@ -38,15 +38,15 @@ "DFSchema", "DataType", "DataTypeMap", - "RexType", - "PythonType", - "SqlType", "NullTreatment", - "SqlTable", + "PythonType", + "RexType", + "SqlFunction", "SqlSchema", - "SqlView", "SqlStatistics", - "SqlFunction", + "SqlTable", + "SqlType", + "SqlView", ] diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 282b2a477..0ab1a908a 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -393,8 
+393,6 @@ def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeEnvBuilder: class RuntimeConfig(RuntimeEnvBuilder): """See `RuntimeEnvBuilder`.""" - pass - class SQLOptions: """Options to be used when performing SQL queries.""" @@ -498,7 +496,7 @@ def __init__( self.ctx = SessionContextInternal(config, runtime) - def enable_url_table(self) -> "SessionContext": + def enable_url_table(self) -> SessionContext: """Control if local files can be queried as tables. Returns: diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index de5d8376e..d1c71c2bb 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -29,6 +29,7 @@ List, Literal, Optional, + Type, Union, overload, ) @@ -49,10 +50,11 @@ import polars as pl import pyarrow as pa + from datafusion._internal import DataFrame as DataFrameInternal + from datafusion._internal import expr as expr_internal + from enum import Enum -from datafusion._internal import DataFrame as DataFrameInternal -from datafusion._internal import expr as expr_internal from datafusion.expr import Expr, SortExpr, sort_or_default @@ -73,7 +75,7 @@ class Compression(Enum): LZ4_RAW = "lz4_raw" @classmethod - def from_str(cls, value: str) -> "Compression": + def from_str(cls: Type[Compression], value: str) -> Compression: """Convert a string to a Compression enum value. Args: @@ -88,8 +90,9 @@ def from_str(cls, value: str) -> "Compression": try: return cls(value.lower()) except ValueError: + valid_values = str([item.value for item in Compression]) raise ValueError( - f"{value} is not a valid Compression. Valid values are: {[item.value for item in Compression]}" + f"{value} is not a valid Compression. 
Valid values are: {valid_values}" ) def get_default_level(self) -> Optional[int]: @@ -104,9 +107,9 @@ def get_default_level(self) -> Optional[int]: # https://github.com/apache/datafusion-python/pull/981#discussion_r1904789223 if self == Compression.GZIP: return 6 - elif self == Compression.BROTLI: + if self == Compression.BROTLI: return 1 - elif self == Compression.ZSTD: + if self == Compression.ZSTD: return 4 return None diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 3639abec6..702f75aed 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -101,63 +101,63 @@ WindowExpr = expr_internal.WindowExpr __all__ = [ - "Expr", - "Column", - "Literal", - "BinaryExpr", - "Literal", + "Aggregate", "AggregateFunction", - "Not", - "IsNotNull", - "IsNull", - "IsTrue", - "IsFalse", - "IsUnknown", - "IsNotTrue", - "IsNotFalse", - "IsNotUnknown", - "Negative", - "Like", - "ILike", - "SimilarTo", - "ScalarVariable", "Alias", - "InList", - "Exists", - "Subquery", - "InSubquery", - "ScalarSubquery", - "Placeholder", - "GroupingSet", + "Analyze", + "Between", + "BinaryExpr", "Case", "CaseBuilder", "Cast", - "TryCast", - "Between", + "Column", + "CreateMemoryTable", + "CreateView", + "Distinct", + "DropTable", + "EmptyRelation", + "Exists", "Explain", + "Expr", + "Extension", + "Filter", + "GroupingSet", + "ILike", + "InList", + "InSubquery", + "IsFalse", + "IsNotFalse", + "IsNotNull", + "IsNotTrue", + "IsNotUnknown", + "IsNull", + "IsTrue", + "IsUnknown", + "Join", + "JoinConstraint", + "JoinType", + "Like", "Limit", - "Aggregate", + "Literal", + "Literal", + "Negative", + "Not", + "Partitioning", + "Placeholder", + "Projection", + "Repartition", + "ScalarSubquery", + "ScalarVariable", + "SimilarTo", "Sort", "SortExpr", - "Analyze", - "EmptyRelation", - "Join", - "JoinType", - "JoinConstraint", + "Subquery", + "SubqueryAlias", + "TableScan", + "TryCast", "Union", "Unnest", "UnnestExpr", - "Extension", - "Filter", - "Projection", - 
"TableScan", - "CreateMemoryTable", - "CreateView", - "Distinct", - "SubqueryAlias", - "DropTable", - "Partitioning", - "Repartition", "Window", "WindowExpr", "WindowFrame", @@ -311,7 +311,7 @@ def __getitem__(self, key: str | int) -> Expr: ) return Expr(self.expr.__getitem__(key)) - def __eq__(self, rhs: Any) -> Expr: + def __eq__(self, rhs: object) -> Expr: """Equal to. Accepts either an expression or any valid PyArrow scalar literal value. @@ -320,7 +320,7 @@ def __eq__(self, rhs: Any) -> Expr: rhs = Expr.literal(rhs) return Expr(self.expr.__eq__(rhs.expr)) - def __ne__(self, rhs: Any) -> Expr: + def __ne__(self, rhs: object) -> Expr: """Not equal to. Accepts either an expression or any valid PyArrow scalar literal value. diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index b449c4868..0cc7434cf 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -18,13 +18,12 @@ from __future__ import annotations -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional import pyarrow as pa from datafusion._internal import functions as f from datafusion.common import NullTreatment -from datafusion.context import SessionContext from datafusion.expr import ( CaseBuilder, Expr, @@ -34,6 +33,9 @@ sort_list_to_raw_sort_list, ) +if TYPE_CHECKING: + from datafusion.context import SessionContext + __all__ = [ "abs", "acos", @@ -81,8 +83,8 @@ "array_sort", "array_to_string", "array_union", - "arrow_typeof", "arrow_cast", + "arrow_typeof", "ascii", "asin", "asinh", @@ -97,6 +99,7 @@ "bool_and", "bool_or", "btrim", + "cardinality", "case", "cbrt", "ceil", @@ -116,6 +119,7 @@ "covar", "covar_pop", "covar_samp", + "cume_dist", "current_date", "current_time", "date_bin", @@ -125,17 +129,17 @@ "datetrunc", "decode", "degrees", + "dense_rank", "digest", "empty", "encode", "ends_with", - "extract", "exp", + "extract", "factorial", "find_in_set", "first_value", "flatten", - "cardinality", "floor", 
"from_unixtime", "gcd", @@ -143,8 +147,10 @@ "initcap", "isnan", "iszero", + "lag", "last_value", "lcm", + "lead", "left", "length", "levenshtein", @@ -166,10 +172,10 @@ "list_prepend", "list_push_back", "list_push_front", - "list_repeat", "list_remove", "list_remove_all", "list_remove_n", + "list_repeat", "list_replace", "list_replace_all", "list_replace_n", @@ -180,14 +186,14 @@ "list_union", "ln", "log", - "log10", "log2", + "log10", "lower", "lpad", "ltrim", "make_array", - "make_list", "make_date", + "make_list", "max", "md5", "mean", @@ -195,19 +201,22 @@ "min", "named_struct", "nanvl", - "nvl", "now", "nth_value", + "ntile", "nullif", + "nvl", "octet_length", "order_by", "overlay", + "percent_rank", "pi", "pow", "power", "radians", "random", "range", + "rank", "regexp_like", "regexp_match", "regexp_replace", @@ -225,6 +234,7 @@ "reverse", "right", "round", + "row_number", "rpad", "rtrim", "sha224", @@ -252,8 +262,8 @@ "to_hex", "to_timestamp", "to_timestamp_micros", - "to_timestamp_nanos", "to_timestamp_millis", + "to_timestamp_nanos", "to_timestamp_seconds", "to_unixtime", "translate", @@ -268,14 +278,6 @@ "when", # Window Functions "window", - "lead", - "lag", - "row_number", - "rank", - "dense_rank", - "percent_rank", - "cume_dist", - "ntile", ] @@ -292,14 +294,14 @@ def nullif(expr1: Expr, expr2: Expr) -> Expr: return Expr(f.nullif(expr1.expr, expr2.expr)) -def encode(input: Expr, encoding: Expr) -> Expr: +def encode(expr: Expr, encoding: Expr) -> Expr: """Encode the ``input``, using the ``encoding``. encoding can be base64 or hex.""" - return Expr(f.encode(input.expr, encoding.expr)) + return Expr(f.encode(expr.expr, encoding.expr)) -def decode(input: Expr, encoding: Expr) -> Expr: +def decode(expr: Expr, encoding: Expr) -> Expr: """Decode the ``input``, using the ``encoding``. 
encoding can be base64 or hex.""" - return Expr(f.decode(input.expr, encoding.expr)) + return Expr(f.decode(expr.expr, encoding.expr)) def array_to_string(expr: Expr, delimiter: Expr) -> Expr: diff --git a/python/datafusion/input/__init__.py b/python/datafusion/input/__init__.py index f85ce21f0..f0c1f42b4 100644 --- a/python/datafusion/input/__init__.py +++ b/python/datafusion/input/__init__.py @@ -23,5 +23,5 @@ from .location import LocationInputPlugin __all__ = [ - LocationInputPlugin, + "LocationInputPlugin", ] diff --git a/python/datafusion/input/base.py b/python/datafusion/input/base.py index 4eba19784..f67dde2a1 100644 --- a/python/datafusion/input/base.py +++ b/python/datafusion/input/base.py @@ -38,11 +38,9 @@ class BaseInputSource(ABC): """ @abstractmethod - def is_correct_input(self, input_item: Any, table_name: str, **kwargs) -> bool: + def is_correct_input(self, input_item: Any, table_name: str, **kwargs: Any) -> bool: """Returns `True` if the input is valid.""" - pass @abstractmethod - def build_table(self, input_item: Any, table_name: str, **kwarg) -> SqlTable: + def build_table(self, input_item: Any, table_name: str, **kwarg: Any) -> SqlTable: # type: ignore[invalid-type-form] """Create a table from the input source.""" - pass diff --git a/python/datafusion/input/location.py b/python/datafusion/input/location.py index 517cd1578..08d98d115 100644 --- a/python/datafusion/input/location.py +++ b/python/datafusion/input/location.py @@ -18,7 +18,7 @@ """The default input source for DataFusion.""" import glob -import os +from pathlib import Path from typing import Any from datafusion.common import DataTypeMap, SqlTable @@ -31,7 +31,7 @@ class LocationInputPlugin(BaseInputSource): This can be read in from a file (on disk, remote etc.). 
""" - def is_correct_input(self, input_item: Any, table_name: str, **kwargs): + def is_correct_input(self, input_item: Any, table_name: str, **kwargs: Any) -> bool: # noqa: ARG002 """Returns `True` if the input is valid.""" return isinstance(input_item, str) @@ -39,27 +39,28 @@ def build_table( self, input_item: str, table_name: str, - **kwargs, - ) -> SqlTable: + **kwargs: Any, # noqa: ARG002 + ) -> SqlTable: # type: ignore[invalid-type-form] """Create a table from the input source.""" - _, extension = os.path.splitext(input_item) - format = extension.lstrip(".").lower() + extension = Path(input_item).suffix + file_format = extension.lstrip(".").lower() num_rows = 0 # Total number of rows in the file. Used for statistics columns = [] - if format == "parquet": + if file_format == "parquet": import pyarrow.parquet as pq # Read the Parquet metadata metadata = pq.read_metadata(input_item) num_rows = metadata.num_rows # Iterate through the schema and build the SqlTable - for col in metadata.schema: - columns.append( - ( - col.name, - DataTypeMap.from_parquet_type_str(col.physical_type), - ) + columns = [ + ( + col.name, + DataTypeMap.from_parquet_type_str(col.physical_type), ) + for col in metadata.schema + ] + elif format == "csv": import csv @@ -69,19 +70,18 @@ def build_table( # to get that information. However, this should only be occurring # at table creation time and therefore shouldn't # slow down query performance. - with open(input_item, "r") as file: + with Path(input_item).open() as file: reader = csv.reader(file) - header_row = next(reader) - print(header_row) + _header_row = next(reader) for _ in reader: num_rows += 1 # TODO: Need to actually consume this row into reasonable columns - raise RuntimeError("TODO: Currently unable to support CSV input files.") + msg = "TODO: Currently unable to support CSV input files." 
+ raise RuntimeError(msg) else: - raise RuntimeError( - f"Input of format: `{format}` is currently not supported.\ + msg = f"Input of format: `{format}` is currently not supported.\ Only Parquet and CSV." - ) + raise RuntimeError(msg) # Input could possibly be multiple files. Create a list if so input_files = glob.glob(input_item) diff --git a/python/datafusion/io.py b/python/datafusion/io.py index 3b6264948..3e39703e3 100644 --- a/python/datafusion/io.py +++ b/python/datafusion/io.py @@ -19,15 +19,19 @@ from __future__ import annotations -import pathlib - -import pyarrow +from typing import TYPE_CHECKING from datafusion.dataframe import DataFrame -from datafusion.expr import Expr from ._internal import SessionContext as SessionContextInternal +if TYPE_CHECKING: + import pathlib + + import pyarrow as pa + + from datafusion.expr import Expr + def read_parquet( path: str | pathlib.Path, @@ -35,7 +39,7 @@ def read_parquet( parquet_pruning: bool = True, file_extension: str = ".parquet", skip_metadata: bool = True, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_sort_order: list[list[Expr]] | None = None, ) -> DataFrame: """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`. 
@@ -79,7 +83,7 @@ def read_parquet( def read_json( path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, schema_infer_max_records: int = 1000, file_extension: str = ".json", table_partition_cols: list[tuple[str, str]] | None = None, @@ -120,7 +124,7 @@ def read_json( def read_csv( path: str | pathlib.Path | list[str] | list[pathlib.Path], - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, has_header: bool = True, delimiter: str = ",", schema_infer_max_records: int = 1000, @@ -173,7 +177,7 @@ def read_csv( def read_avro( path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_partition_cols: list[tuple[str, str]] | None = None, file_extension: str = ".avro", ) -> DataFrame: diff --git a/python/datafusion/object_store.py b/python/datafusion/object_store.py index 7cc17506f..6298526f5 100644 --- a/python/datafusion/object_store.py +++ b/python/datafusion/object_store.py @@ -24,4 +24,4 @@ MicrosoftAzure = object_store.MicrosoftAzure Http = object_store.Http -__all__ = ["AmazonS3", "GoogleCloud", "LocalFileSystem", "MicrosoftAzure", "Http"] +__all__ = ["AmazonS3", "GoogleCloud", "Http", "LocalFileSystem", "MicrosoftAzure"] diff --git a/python/datafusion/plan.py b/python/datafusion/plan.py index 133fc446d..0b7bebcb3 100644 --- a/python/datafusion/plan.py +++ b/python/datafusion/plan.py @@ -19,7 +19,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, List +from typing import TYPE_CHECKING, Any import datafusion._internal as df_internal @@ -27,8 +27,8 @@ from datafusion.context import SessionContext __all__ = [ - "LogicalPlan", "ExecutionPlan", + "LogicalPlan", ] @@ -54,7 +54,7 @@ def to_variant(self) -> Any: """Convert the logical plan into its specific variant.""" return self._raw_plan.to_variant() - def inputs(self) -> List[LogicalPlan]: + def inputs(self) -> list[LogicalPlan]: """Returns the list of inputs to the logical 
plan.""" return [LogicalPlan(p) for p in self._raw_plan.inputs()] @@ -106,7 +106,7 @@ def __init__(self, plan: df_internal.ExecutionPlan) -> None: """This constructor should not be called by the end user.""" self._raw_plan = plan - def children(self) -> List[ExecutionPlan]: + def children(self) -> list[ExecutionPlan]: """Get a list of children `ExecutionPlan` that act as inputs to this plan. The returned list will be empty for leaf nodes such as scans, will contain a diff --git a/python/datafusion/record_batch.py b/python/datafusion/record_batch.py index 772cd9089..556eaa786 100644 --- a/python/datafusion/record_batch.py +++ b/python/datafusion/record_batch.py @@ -26,14 +26,14 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - import pyarrow + import pyarrow as pa import typing_extensions import datafusion._internal as df_internal class RecordBatch: - """This class is essentially a wrapper for :py:class:`pyarrow.RecordBatch`.""" + """This class is essentially a wrapper for :py:class:`pa.RecordBatch`.""" def __init__(self, record_batch: df_internal.RecordBatch) -> None: """This constructor is generally not called by the end user. 
@@ -42,8 +42,8 @@ def __init__(self, record_batch: df_internal.RecordBatch) -> None: """ self.record_batch = record_batch - def to_pyarrow(self) -> pyarrow.RecordBatch: - """Convert to :py:class:`pyarrow.RecordBatch`.""" + def to_pyarrow(self) -> pa.RecordBatch: + """Convert to :py:class:`pa.RecordBatch`.""" return self.record_batch.to_pyarrow() diff --git a/python/datafusion/substrait.py b/python/datafusion/substrait.py index 06302fe38..f10adfb0c 100644 --- a/python/datafusion/substrait.py +++ b/python/datafusion/substrait.py @@ -23,7 +23,6 @@ from __future__ import annotations -import pathlib from typing import TYPE_CHECKING try: @@ -36,11 +35,13 @@ from ._internal import substrait as substrait_internal if TYPE_CHECKING: + import pathlib + from datafusion.context import SessionContext __all__ = [ - "Plan", "Consumer", + "Plan", "Producer", "Serde", ] @@ -68,11 +69,9 @@ def encode(self) -> bytes: @deprecated("Use `Plan` instead.") -class plan(Plan): +class plan(Plan): # noqa: N801 """See `Plan`.""" - pass - class Serde: """Provides the ``Substrait`` serialization and deserialization.""" @@ -140,11 +139,9 @@ def deserialize_bytes(proto_bytes: bytes) -> Plan: @deprecated("Use `Serde` instead.") -class serde(Serde): +class serde(Serde): # noqa: N801 """See `Serde` instead.""" - pass - class Producer: """Generates substrait plans from a logical plan.""" @@ -168,11 +165,9 @@ def to_substrait_plan(logical_plan: LogicalPlan, ctx: SessionContext) -> Plan: @deprecated("Use `Producer` instead.") -class producer(Producer): +class producer(Producer): # noqa: N801 """Use `Producer` instead.""" - pass - class Consumer: """Generates a logical plan from a substrait plan.""" @@ -194,7 +189,5 @@ def from_substrait_plan(ctx: SessionContext, plan: Plan) -> LogicalPlan: @deprecated("Use `Consumer` instead.") -class consumer(Consumer): +class consumer(Consumer): # noqa: N801 """Use `Consumer` instead.""" - - pass diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index 
af7bcf2ed..603b7063d 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -22,15 +22,15 @@ import functools from abc import ABCMeta, abstractmethod from enum import Enum -from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar +from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, overload -import pyarrow +import pyarrow as pa import datafusion._internal as df_internal from datafusion.expr import Expr if TYPE_CHECKING: - _R = TypeVar("_R", bound=pyarrow.DataType) + _R = TypeVar("_R", bound=pa.DataType) class Volatility(Enum): @@ -72,7 +72,7 @@ class Volatility(Enum): for each output row, resulting in a unique random value for each row. """ - def __str__(self): + def __str__(self) -> str: """Returns the string equivalent.""" return self.name.lower() @@ -88,7 +88,7 @@ def __init__( self, name: str, func: Callable[..., _R], - input_types: pyarrow.DataType | list[pyarrow.DataType], + input_types: pa.DataType | list[pa.DataType], return_type: _R, volatility: Volatility | str, ) -> None: @@ -96,7 +96,7 @@ def __init__( See helper method :py:func:`udf` for argument details. """ - if isinstance(input_types, pyarrow.DataType): + if isinstance(input_types, pa.DataType): input_types = [input_types] self._udf = df_internal.ScalarUDF( name, func, input_types, return_type, str(volatility) @@ -111,7 +111,27 @@ def __call__(self, *args: Expr) -> Expr: args_raw = [arg.expr for arg in args] return Expr(self._udf.__call__(*args_raw)) - class udf: + @overload + @staticmethod + def udf( + input_types: list[pa.DataType], + return_type: _R, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> Callable[..., ScalarUDF]: ... + + @overload + @staticmethod + def udf( + func: Callable[..., _R], + input_types: list[pa.DataType], + return_type: _R, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> ScalarUDF: ... 
+ + @staticmethod + def udf(*args: Any, **kwargs: Any): # noqa: D417 """Create a new User-Defined Function (UDF). This class can be used both as a **function** and as a **decorator**. @@ -125,7 +145,7 @@ class udf: Args: func (Callable, optional): **Only needed when calling as a function.** Skip this argument when using `udf` as a decorator. - input_types (list[pyarrow.DataType]): The data types of the arguments + input_types (list[pa.DataType]): The data types of the arguments to `func`. This list must be of the same length as the number of arguments. return_type (_R): The data type of the return value from the function. @@ -141,40 +161,28 @@ class udf: ``` def double_func(x): return x * 2 - double_udf = udf(double_func, [pyarrow.int32()], pyarrow.int32(), + double_udf = udf(double_func, [pa.int32()], pa.int32(), "volatile", "double_it") ``` **Using `udf` as a decorator:** ``` - @udf([pyarrow.int32()], pyarrow.int32(), "volatile", "double_it") + @udf([pa.int32()], pa.int32(), "volatile", "double_it") def double_udf(x): return x * 2 ``` """ - def __new__(cls, *args, **kwargs): - """Create a new UDF. 
- - Trigger UDF function or decorator depending on if the first args is callable - """ - if args and callable(args[0]): - # Case 1: Used as a function, require the first parameter to be callable - return cls._function(*args, **kwargs) - else: - # Case 2: Used as a decorator with parameters - return cls._decorator(*args, **kwargs) - - @staticmethod def _function( func: Callable[..., _R], - input_types: list[pyarrow.DataType], + input_types: list[pa.DataType], return_type: _R, volatility: Volatility | str, name: Optional[str] = None, ) -> ScalarUDF: if not callable(func): - raise TypeError("`func` argument must be callable") + msg = "`func` argument must be callable" + raise TypeError(msg) if name is None: if hasattr(func, "__qualname__"): name = func.__qualname__.lower() @@ -188,49 +196,50 @@ def _function( volatility=volatility, ) - @staticmethod def _decorator( - input_types: list[pyarrow.DataType], + input_types: list[pa.DataType], return_type: _R, volatility: Volatility | str, name: Optional[str] = None, - ): - def decorator(func): + ) -> Callable: + def decorator(func: Callable): udf_caller = ScalarUDF.udf( func, input_types, return_type, volatility, name ) @functools.wraps(func) - def wrapper(*args, **kwargs): + def wrapper(*args: Any, **kwargs: Any): return udf_caller(*args, **kwargs) return wrapper return decorator + if args and callable(args[0]): + # Case 1: Used as a function, require the first parameter to be callable + return _function(*args, **kwargs) + # Case 2: Used as a decorator with parameters + return _decorator(*args, **kwargs) + class Accumulator(metaclass=ABCMeta): """Defines how an :py:class:`AggregateUDF` accumulates values.""" @abstractmethod - def state(self) -> List[pyarrow.Scalar]: + def state(self) -> list[pa.Scalar]: """Return the current state.""" - pass @abstractmethod - def update(self, *values: pyarrow.Array) -> None: + def update(self, *values: pa.Array) -> None: """Evaluate an array of values and update state.""" - pass 
@abstractmethod - def merge(self, states: List[pyarrow.Array]) -> None: + def merge(self, states: list[pa.Array]) -> None: """Merge a set of states.""" - pass @abstractmethod - def evaluate(self) -> pyarrow.Scalar: + def evaluate(self) -> pa.Scalar: """Return the resultant value.""" - pass class AggregateUDF: @@ -244,9 +253,9 @@ def __init__( self, name: str, accumulator: Callable[[], Accumulator], - input_types: list[pyarrow.DataType], - return_type: pyarrow.DataType, - state_type: list[pyarrow.DataType], + input_types: list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], volatility: Volatility | str, ) -> None: """Instantiate a user-defined aggregate function (UDAF). @@ -272,7 +281,29 @@ def __call__(self, *args: Expr) -> Expr: args_raw = [arg.expr for arg in args] return Expr(self._udaf.__call__(*args_raw)) - class udaf: + @overload + @staticmethod + def udaf( + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], + volatility: Volatility | str, + name: Optional[str] = None, + ) -> Callable[..., AggregateUDF]: ... + + @overload + @staticmethod + def udaf( + accum: Callable[[], Accumulator], + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], + volatility: Volatility | str, + name: Optional[str] = None, + ) -> AggregateUDF: ... + + @staticmethod + def udaf(*args: Any, **kwargs: Any): # noqa: D417 """Create a new User-Defined Aggregate Function (UDAF). 
This class allows you to define an **aggregate function** that can be used in @@ -300,13 +331,13 @@ class Summarize(Accumulator): def __init__(self, bias: float = 0.0): self._sum = pa.scalar(bias) - def state(self) -> List[pa.Scalar]: + def state(self) -> list[pa.Scalar]: return [self._sum] def update(self, values: pa.Array) -> None: self._sum = pa.scalar(self._sum.as_py() + pc.sum(values).as_py()) - def merge(self, states: List[pa.Array]) -> None: + def merge(self, states: list[pa.Array]) -> None: self._sum = pa.scalar(self._sum.as_py() + pc.sum(states[0]).as_py()) def evaluate(self) -> pa.Scalar: @@ -344,37 +375,23 @@ def udf4() -> Summarize: aggregation or window function calls. """ - def __new__(cls, *args, **kwargs): - """Create a new UDAF. - - Trigger UDAF function or decorator depending on if the first args is - callable - """ - if args and callable(args[0]): - # Case 1: Used as a function, require the first parameter to be callable - return cls._function(*args, **kwargs) - else: - # Case 2: Used as a decorator with parameters - return cls._decorator(*args, **kwargs) - - @staticmethod def _function( accum: Callable[[], Accumulator], - input_types: pyarrow.DataType | list[pyarrow.DataType], - return_type: pyarrow.DataType, - state_type: list[pyarrow.DataType], + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], volatility: Volatility | str, name: Optional[str] = None, ) -> AggregateUDF: if not callable(accum): - raise TypeError("`func` must be callable.") - if not isinstance(accum.__call__(), Accumulator): - raise TypeError( - "Accumulator must implement the abstract base class Accumulator" - ) + msg = "`func` must be callable." 
+ raise TypeError(msg) + if not isinstance(accum(), Accumulator): + msg = "Accumulator must implement the abstract base class Accumulator" + raise TypeError(msg) if name is None: - name = accum.__call__().__class__.__qualname__.lower() - if isinstance(input_types, pyarrow.DataType): + name = accum().__class__.__qualname__.lower() + if isinstance(input_types, pa.DataType): input_types = [input_types] return AggregateUDF( name=name, @@ -385,29 +402,34 @@ def _function( volatility=volatility, ) - @staticmethod def _decorator( - input_types: pyarrow.DataType | list[pyarrow.DataType], - return_type: pyarrow.DataType, - state_type: list[pyarrow.DataType], + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], volatility: Volatility | str, name: Optional[str] = None, - ): - def decorator(accum: Callable[[], Accumulator]): + ) -> Callable[..., Callable[..., Expr]]: + def decorator(accum: Callable[[], Accumulator]) -> Callable[..., Expr]: udaf_caller = AggregateUDF.udaf( accum, input_types, return_type, state_type, volatility, name ) @functools.wraps(accum) - def wrapper(*args, **kwargs): + def wrapper(*args: Any, **kwargs: Any) -> Expr: return udaf_caller(*args, **kwargs) return wrapper return decorator + if args and callable(args[0]): + # Case 1: Used as a function, require the first parameter to be callable + return _function(*args, **kwargs) + # Case 2: Used as a decorator with parameters + return _decorator(*args, **kwargs) + -class WindowEvaluator(metaclass=ABCMeta): +class WindowEvaluator: """Evaluator class for user-defined window functions (UDWF). It is up to the user to decide which evaluate function is appropriate. 
@@ -423,7 +445,7 @@ class WindowEvaluator(metaclass=ABCMeta): +------------------------+--------------------------------+------------------+---------------------------+ | True | True/False | True/False | ``evaluate`` | +------------------------+--------------------------------+------------------+---------------------------+ - """ # noqa: W505 + """ # noqa: W505, E501 def memoize(self) -> None: """Perform a memoize operation to improve performance. @@ -436,9 +458,8 @@ def memoize(self) -> None: `memoize` is called after each input batch is processed, and such functions can save whatever they need """ - pass - def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: + def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: # noqa: ARG002 """Return the range for the window fuction. If `uses_window_frame` flag is `false`. This method is used to @@ -460,14 +481,17 @@ def is_causal(self) -> bool: """Get whether evaluator needs future data for its result.""" return False - def evaluate_all(self, values: list[pyarrow.Array], num_rows: int) -> pyarrow.Array: + def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: """Evaluate a window function on an entire input partition. This function is called once per input *partition* for window functions that *do not use* values from the window frame, such as - :py:func:`~datafusion.functions.row_number`, :py:func:`~datafusion.functions.rank`, - :py:func:`~datafusion.functions.dense_rank`, :py:func:`~datafusion.functions.percent_rank`, - :py:func:`~datafusion.functions.cume_dist`, :py:func:`~datafusion.functions.lead`, + :py:func:`~datafusion.functions.row_number`, + :py:func:`~datafusion.functions.rank`, + :py:func:`~datafusion.functions.dense_rank`, + :py:func:`~datafusion.functions.percent_rank`, + :py:func:`~datafusion.functions.cume_dist`, + :py:func:`~datafusion.functions.lead`, and :py:func:`~datafusion.functions.lag`. It produces the result of all rows in a single pass. 
It @@ -499,12 +523,11 @@ def evaluate_all(self, values: list[pyarrow.Array], num_rows: int) -> pyarrow.Ar .. code-block:: text avg(x) OVER (PARTITION BY y ORDER BY z ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING) - """ # noqa: W505 - pass + """ # noqa: W505, E501 def evaluate( - self, values: list[pyarrow.Array], eval_range: tuple[int, int] - ) -> pyarrow.Scalar: + self, values: list[pa.Array], eval_range: tuple[int, int] + ) -> pa.Scalar: """Evaluate window function on a range of rows in an input partition. This is the simplest and most general function to implement @@ -519,11 +542,10 @@ def evaluate( and evaluation results of ORDER BY expressions. If function has a single argument, `values[1..]` will contain ORDER BY expression results. """ - pass def evaluate_all_with_rank( self, num_rows: int, ranks_in_partition: list[tuple[int, int]] - ) -> pyarrow.Array: + ) -> pa.Array: """Called for window functions that only need the rank of a row. Evaluate the partition evaluator against the partition using @@ -552,7 +574,6 @@ def evaluate_all_with_rank( The user must implement this method if ``include_rank`` returns True. """ - pass def supports_bounded_execution(self) -> bool: """Can the window function be incrementally computed using bounded memory?""" @@ -567,10 +588,6 @@ def include_rank(self) -> bool: return False -if TYPE_CHECKING: - _W = TypeVar("_W", bound=WindowEvaluator) - - class WindowUDF: """Class for performing window user-defined functions (UDF). @@ -582,8 +599,8 @@ def __init__( self, name: str, func: Callable[[], WindowEvaluator], - input_types: list[pyarrow.DataType], - return_type: pyarrow.DataType, + input_types: list[pa.DataType], + return_type: pa.DataType, volatility: Volatility | str, ) -> None: """Instantiate a user-defined window function (UDWF). 
@@ -607,8 +624,8 @@ def __call__(self, *args: Expr) -> Expr: @staticmethod def udwf( func: Callable[[], WindowEvaluator], - input_types: pyarrow.DataType | list[pyarrow.DataType], - return_type: pyarrow.DataType, + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, volatility: Volatility | str, name: Optional[str] = None, ) -> WindowUDF: @@ -648,16 +665,16 @@ def bias_10() -> BiasedNumbers: Returns: A user-defined window function. - """ # noqa W505 + """ # noqa: W505, E501 if not callable(func): - raise TypeError("`func` must be callable.") - if not isinstance(func.__call__(), WindowEvaluator): - raise TypeError( - "`func` must implement the abstract base class WindowEvaluator" - ) + msg = "`func` must be callable." + raise TypeError(msg) + if not isinstance(func(), WindowEvaluator): + msg = "`func` must implement the abstract base class WindowEvaluator" + raise TypeError(msg) if name is None: - name = func.__call__().__class__.__qualname__.lower() - if isinstance(input_types, pyarrow.DataType): + name = func().__class__.__qualname__.lower() + if isinstance(input_types, pa.DataType): input_types = [input_types] return WindowUDF( name=name, @@ -666,3 +683,10 @@ def bias_10() -> BiasedNumbers: return_type=return_type, volatility=volatility, ) + + +# Convenience exports so we can import instead of treating as +# variables at the package root +udf = ScalarUDF.udf +udaf = AggregateUDF.udaf +udwf = WindowUDF.udwf diff --git a/python/tests/generic.py b/python/tests/generic.py index 0177e2df0..1b98fdf9e 100644 --- a/python/tests/generic.py +++ b/python/tests/generic.py @@ -16,6 +16,7 @@ # under the License. 
import datetime +from datetime import timezone import numpy as np import pyarrow as pa @@ -26,29 +27,29 @@ def data(): - np.random.seed(1) + rng = np.random.default_rng(1) data = np.concatenate( [ - np.random.normal(0, 0.01, size=50), - np.random.normal(50, 0.01, size=50), + rng.normal(0, 0.01, size=50), + rng.normal(50, 0.01, size=50), ] ) return pa.array(data) def data_with_nans(): - np.random.seed(0) - data = np.random.normal(0, 0.01, size=50) - mask = np.random.randint(0, 2, size=50) + rng = np.random.default_rng(0) + data = rng.normal(0, 0.01, size=50) + mask = rng.integers(0, 2, size=50) data[mask == 0] = np.nan return data def data_datetime(f): data = [ - datetime.datetime.now(), - datetime.datetime.now() - datetime.timedelta(days=1), - datetime.datetime.now() + datetime.timedelta(days=1), + datetime.datetime.now(tz=timezone.utc), + datetime.datetime.now(tz=timezone.utc) - datetime.timedelta(days=1), + datetime.datetime.now(tz=timezone.utc) + datetime.timedelta(days=1), ] return pa.array(data, type=pa.timestamp(f), mask=np.array([False, True, False])) diff --git a/python/tests/test_aggregation.py b/python/tests/test_aggregation.py index 5ef46131b..61b1c7d80 100644 --- a/python/tests/test_aggregation.py +++ b/python/tests/test_aggregation.py @@ -66,7 +66,7 @@ def df_aggregate_100(): @pytest.mark.parametrize( - "agg_expr, calc_expected", + ("agg_expr", "calc_expected"), [ (f.avg(column("a")), lambda a, b, c, d: np.array(np.average(a))), ( @@ -114,7 +114,7 @@ def test_aggregation_stats(df, agg_expr, calc_expected): @pytest.mark.parametrize( - "agg_expr, expected, array_sort", + ("agg_expr", "expected", "array_sort"), [ (f.approx_distinct(column("b")), pa.array([2], type=pa.uint64()), False), ( @@ -182,12 +182,11 @@ def test_aggregation(df, agg_expr, expected, array_sort): agg_df.show() result = agg_df.collect()[0] - print(result) assert result.column(0) == expected @pytest.mark.parametrize( - "name,expr,expected", + ("name", "expr", "expected"), [ (
"approx_percentile_cont", @@ -299,7 +298,9 @@ def test_aggregate_100(df_aggregate_100, name, expr, expected): ] -@pytest.mark.parametrize("name,expr,result", data_test_bitwise_and_boolean_functions) +@pytest.mark.parametrize( + ("name", "expr", "result"), data_test_bitwise_and_boolean_functions +) def test_bit_and_bool_fns(df, name, expr, result): df = df.aggregate([], [expr.alias(name)]) @@ -311,7 +312,7 @@ def test_bit_and_bool_fns(df, name, expr, result): @pytest.mark.parametrize( - "name,expr,result", + ("name", "expr", "result"), [ ("first_value", f.first_value(column("a")), [0, 4]), ( @@ -361,7 +362,6 @@ def test_bit_and_bool_fns(df, name, expr, result): ), [8, 9], ), - ("first_value", f.first_value(column("a")), [0, 4]), ( "nth_value_ordered", f.nth_value(column("a"), 2, order_by=[column("a").sort(ascending=False)]), @@ -401,7 +401,7 @@ def test_first_last_value(df_partitioned, name, expr, result) -> None: @pytest.mark.parametrize( - "name,expr,result", + ("name", "expr", "result"), [ ("string_agg", f.string_agg(column("a"), ","), "one,two,three,two"), ("string_agg", f.string_agg(column("b"), ""), "03124"), diff --git a/python/tests/test_catalog.py b/python/tests/test_catalog.py index 214f6b165..23b328458 100644 --- a/python/tests/test_catalog.py +++ b/python/tests/test_catalog.py @@ -19,6 +19,9 @@ import pytest +# Note we take in `database` as a variable even though we don't use +# it because that will cause the fixture to set up the context with +# the tables we need. 
def test_basic(ctx, database): with pytest.raises(KeyError): ctx.catalog("non-existent") @@ -26,10 +29,10 @@ def test_basic(ctx, database): default = ctx.catalog() assert default.names() == ["public"] - for database in [default.database("public"), default.database()]: - assert database.names() == {"csv1", "csv", "csv2"} + for db in [default.database("public"), default.database()]: + assert db.names() == {"csv1", "csv", "csv2"} - table = database.table("csv") + table = db.table("csv") assert table.kind == "physical" assert table.schema == pa.schema( [ diff --git a/python/tests/test_context.py b/python/tests/test_context.py index 91046e6b8..7a0a7aa08 100644 --- a/python/tests/test_context.py +++ b/python/tests/test_context.py @@ -16,7 +16,6 @@ # under the License. import datetime as dt import gzip -import os import pathlib import pyarrow as pa @@ -45,7 +44,7 @@ def test_create_context_runtime_config_only(): SessionContext(runtime=RuntimeEnvBuilder()) -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_runtime_configs(tmp_path, path_to_str): path1 = tmp_path / "dir1" path2 = tmp_path / "dir2" @@ -62,7 +61,7 @@ def test_runtime_configs(tmp_path, path_to_str): assert db is not None -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_temporary_files(tmp_path, path_to_str): path = str(tmp_path) if path_to_str else tmp_path @@ -79,14 +78,14 @@ def test_create_context_with_all_valid_args(): runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000) config = ( SessionConfig() - .with_create_default_catalog_and_schema(True) + .with_create_default_catalog_and_schema(enabled=True) .with_default_catalog_and_schema("foo", "bar") .with_target_partitions(1) - .with_information_schema(True) - .with_repartition_joins(False) - .with_repartition_aggregations(False) - .with_repartition_windows(False) - 
.with_parquet_pruning(False) + .with_information_schema(enabled=True) + .with_repartition_joins(enabled=False) + .with_repartition_aggregations(enabled=False) + .with_repartition_windows(enabled=False) + .with_parquet_pruning(enabled=False) ) ctx = SessionContext(config, runtime) @@ -167,7 +166,7 @@ def test_from_arrow_table(ctx): def record_batch_generator(num_batches: int): schema = pa.schema([("a", pa.int64()), ("b", pa.int64())]) - for i in range(num_batches): + for _i in range(num_batches): yield pa.RecordBatch.from_arrays( [pa.array([1, 2, 3]), pa.array([4, 5, 6])], schema=schema ) @@ -492,10 +491,10 @@ def test_table_not_found(ctx): def test_read_json(ctx): - path = os.path.dirname(os.path.abspath(__file__)) + path = pathlib.Path(__file__).parent.resolve() # Default - test_data_path = os.path.join(path, "data_test_context", "data.json") + test_data_path = path / "data_test_context" / "data.json" df = ctx.read_json(test_data_path) result = df.collect() @@ -515,7 +514,7 @@ def test_read_json(ctx): assert result[0].schema == schema # File extension - test_data_path = os.path.join(path, "data_test_context", "data.json") + test_data_path = path / "data_test_context" / "data.json" df = ctx.read_json(test_data_path, file_extension=".json") result = df.collect() @@ -524,15 +523,17 @@ def test_read_json(ctx): def test_read_json_compressed(ctx, tmp_path): - path = os.path.dirname(os.path.abspath(__file__)) - test_data_path = os.path.join(path, "data_test_context", "data.json") + path = pathlib.Path(__file__).parent.resolve() + test_data_path = path / "data_test_context" / "data.json" # File compression type gzip_path = tmp_path / "data.json.gz" - with open(test_data_path, "rb") as csv_file: - with gzip.open(gzip_path, "wb") as gzipped_file: - gzipped_file.writelines(csv_file) + with ( + pathlib.Path.open(test_data_path, "rb") as csv_file, + gzip.open(gzip_path, "wb") as gzipped_file, + ): + gzipped_file.writelines(csv_file) df = ctx.read_json(gzip_path, 
file_extension=".gz", file_compression_type="gz") result = df.collect() @@ -563,14 +564,16 @@ def test_read_csv_list(ctx): def test_read_csv_compressed(ctx, tmp_path): - test_data_path = "testing/data/csv/aggregate_test_100.csv" + test_data_path = pathlib.Path("testing/data/csv/aggregate_test_100.csv") # File compression type gzip_path = tmp_path / "aggregate_test_100.csv.gz" - with open(test_data_path, "rb") as csv_file: - with gzip.open(gzip_path, "wb") as gzipped_file: - gzipped_file.writelines(csv_file) + with ( + pathlib.Path.open(test_data_path, "rb") as csv_file, + gzip.open(gzip_path, "wb") as gzipped_file, + ): + gzipped_file.writelines(csv_file) csv_df = ctx.read_csv(gzip_path, file_extension=".gz", file_compression_type="gz") csv_df.select(column("c1")).show() @@ -603,7 +606,7 @@ def test_create_sql_options(): def test_sql_with_options_no_ddl(ctx): sql = "CREATE TABLE IF NOT EXISTS valuetable AS VALUES(1,'HELLO'),(12,'DATAFUSION')" ctx.sql(sql) - options = SQLOptions().with_allow_ddl(False) + options = SQLOptions().with_allow_ddl(allow=False) with pytest.raises(Exception, match="DDL"): ctx.sql_with_options(sql, options=options) @@ -618,7 +621,7 @@ def test_sql_with_options_no_dml(ctx): ctx.register_dataset(table_name, dataset) sql = f'INSERT INTO "{table_name}" VALUES (1, 2), (2, 3);' ctx.sql(sql) - options = SQLOptions().with_allow_dml(False) + options = SQLOptions().with_allow_dml(allow=False) with pytest.raises(Exception, match="DML"): ctx.sql_with_options(sql, options=options) @@ -626,6 +629,6 @@ def test_sql_with_options_no_dml(ctx): def test_sql_with_options_no_statements(ctx): sql = "SET time zone = 1;" ctx.sql(sql) - options = SQLOptions().with_allow_statements(False) + options = SQLOptions().with_allow_statements(allow=False) with pytest.raises(Exception, match="SetVariable"): ctx.sql_with_options(sql, options=options) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index c636e896a..d084f12dd 100644 --- 
a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -339,7 +339,7 @@ def test_join(): # Verify we don't make a breaking change to pre-43.0.0 # where users would pass join_keys as a positional argument - df2 = df.join(df1, (["a"], ["a"]), how="inner") # type: ignore + df2 = df.join(df1, (["a"], ["a"]), how="inner") df2.show() df2 = df2.sort(column("l.a")) table = pa.Table.from_batches(df2.collect()) @@ -375,17 +375,17 @@ def test_join_invalid_params(): with pytest.raises( ValueError, match=r"`left_on` or `right_on` should not provided with `on`" ): - df2 = df.join(df1, on="a", how="inner", right_on="test") # type: ignore + df2 = df.join(df1, on="a", how="inner", right_on="test") with pytest.raises( ValueError, match=r"`left_on` and `right_on` should both be provided." ): - df2 = df.join(df1, left_on="a", how="inner") # type: ignore + df2 = df.join(df1, left_on="a", how="inner") with pytest.raises( ValueError, match=r"either `on` or `left_on` and `right_on` should be provided." 
): - df2 = df.join(df1, how="inner") # type: ignore + df2 = df.join(df1, how="inner") def test_join_on(): @@ -567,7 +567,7 @@ def test_distinct(): ] -@pytest.mark.parametrize("name,expr,result", data_test_window_functions) +@pytest.mark.parametrize(("name", "expr", "result"), data_test_window_functions) def test_window_functions(partitioned_df, name, expr, result): df = partitioned_df.select( column("a"), column("b"), column("c"), f.alias(expr, name) @@ -731,7 +731,7 @@ def test_execution_plan(aggregate_df): plan = aggregate_df.execution_plan() expected = ( - "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[sum(test.c2)]\n" # noqa: E501 + "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[sum(test.c2)]\n" ) assert expected == plan.display() @@ -756,7 +756,7 @@ def test_execution_plan(aggregate_df): ctx = SessionContext() rows_returned = 0 - for idx in range(0, plan.partition_count): + for idx in range(plan.partition_count): stream = ctx.execute(plan, idx) try: batch = stream.next() @@ -885,7 +885,7 @@ def test_union_distinct(ctx): ) df_c = ctx.create_dataframe([[batch]]).sort(column("a")) - df_a_u_b = df_a.union(df_b, True).sort(column("a")) + df_a_u_b = df_a.union(df_b, distinct=True).sort(column("a")) assert df_c.collect() == df_a_u_b.collect() assert df_c.collect() == df_a_u_b.collect() @@ -954,8 +954,6 @@ def test_to_arrow_table(df): def test_execute_stream(df): stream = df.execute_stream() - for s in stream: - print(type(s)) assert all(batch is not None for batch in stream) assert not list(stream) # after one iteration the generator must be exhausted @@ -969,7 +967,7 @@ def test_execute_stream_to_arrow_table(df, schema): (batch.to_pyarrow() for batch in stream), schema=df.schema() ) else: - pyarrow_table = pa.Table.from_batches((batch.to_pyarrow() for batch in stream)) + pyarrow_table = pa.Table.from_batches(batch.to_pyarrow() for batch in stream) assert isinstance(pyarrow_table, pa.Table) assert pyarrow_table.shape == (3, 3) @@ 
-1033,7 +1031,7 @@ def test_describe(df): } -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_write_csv(ctx, df, tmp_path, path_to_str): path = str(tmp_path) if path_to_str else tmp_path @@ -1046,7 +1044,7 @@ def test_write_csv(ctx, df, tmp_path, path_to_str): assert result == expected -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_write_json(ctx, df, tmp_path, path_to_str): path = str(tmp_path) if path_to_str else tmp_path @@ -1059,7 +1057,7 @@ def test_write_json(ctx, df, tmp_path, path_to_str): assert result == expected -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_write_parquet(df, tmp_path, path_to_str): path = str(tmp_path) if path_to_str else tmp_path @@ -1071,7 +1069,7 @@ def test_write_parquet(df, tmp_path, path_to_str): @pytest.mark.parametrize( - "compression, compression_level", + ("compression", "compression_level"), [("gzip", 6), ("brotli", 7), ("zstd", 15)], ) def test_write_compressed_parquet(df, tmp_path, compression, compression_level): @@ -1082,7 +1080,7 @@ def test_write_compressed_parquet(df, tmp_path, compression, compression_level): ) # test that the actual compression scheme is the one written - for root, dirs, files in os.walk(path): + for _root, _dirs, files in os.walk(path): for file in files: if file.endswith(".parquet"): metadata = pq.ParquetFile(tmp_path / file).metadata.to_dict() @@ -1097,7 +1095,7 @@ def test_write_compressed_parquet(df, tmp_path, compression, compression_level): @pytest.mark.parametrize( - "compression, compression_level", + ("compression", "compression_level"), [("gzip", 12), ("brotli", 15), ("zstd", 23), ("wrong", 12)], ) def test_write_compressed_parquet_wrong_compression_level( @@ -1152,7 +1150,7 @@ def test_dataframe_export(df) -> None: table = pa.table(df, schema=desired_schema) assert 
table.num_columns == 1 assert table.num_rows == 3 - for i in range(0, 3): + for i in range(3): assert table[0][i].as_py() is None # Expect an error when we cannot convert schema @@ -1186,8 +1184,8 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame: result = df.to_pydict() assert result["a"] == [1, 2, 3] - assert result["string_col"] == ["string data" for _i in range(0, 3)] - assert result["new_col"] == [3 for _i in range(0, 3)] + assert result["string_col"] == ["string data" for _i in range(3)] + assert result["new_col"] == [3 for _i in range(3)] def test_dataframe_repr_html(df) -> None: diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py index 354c7e180..926e69845 100644 --- a/python/tests/test_expr.py +++ b/python/tests/test_expr.py @@ -85,18 +85,14 @@ def test_limit(test_ctx): plan = plan.to_variant() assert isinstance(plan, Limit) - # TODO: Upstream now has expressions for skip and fetch - # REF: https://github.com/apache/datafusion/pull/12836 - # assert plan.skip() == 0 + assert "Skip: None" in str(plan) df = test_ctx.sql("select c1 from test LIMIT 10 OFFSET 5") plan = df.logical_plan() plan = plan.to_variant() assert isinstance(plan, Limit) - # TODO: Upstream now has expressions for skip and fetch - # REF: https://github.com/apache/datafusion/pull/12836 - # assert plan.skip() == 5 + assert "Skip: Some(Literal(Int64(5)))" in str(plan) def test_aggregate_query(test_ctx): @@ -165,6 +161,7 @@ def traverse_logical_plan(plan): res = traverse_logical_plan(input_plan) if res is not None: return res + return None ctx = SessionContext() data = {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]} @@ -176,7 +173,7 @@ def traverse_logical_plan(plan): assert variant.expr().to_variant().qualified_name() == "table1.name" assert ( str(variant.list()) - == '[Expr(Utf8("dfa")), Expr(Utf8("ad")), Expr(Utf8("dfre")), Expr(Utf8("vsa"))]' + == '[Expr(Utf8("dfa")), Expr(Utf8("ad")), Expr(Utf8("dfre")), Expr(Utf8("vsa"))]' # noqa: E501 ) assert not 
variant.negated() diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index fca05bb8f..ed88a16e3 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. import math -from datetime import datetime +from datetime import datetime, timezone import numpy as np import pyarrow as pa @@ -25,6 +25,8 @@ np.seterr(invalid="ignore") +DEFAULT_TZ = timezone.utc + @pytest.fixture def df(): @@ -37,9 +39,9 @@ def df(): pa.array(["hello ", " world ", " !"], type=pa.string_view()), pa.array( [ - datetime(2022, 12, 31), - datetime(2027, 6, 26), - datetime(2020, 7, 2), + datetime(2022, 12, 31, tzinfo=DEFAULT_TZ), + datetime(2027, 6, 26, tzinfo=DEFAULT_TZ), + datetime(2020, 7, 2, tzinfo=DEFAULT_TZ), ] ), pa.array([False, True, True]), @@ -221,12 +223,12 @@ def py_indexof(arr, v): def py_arr_remove(arr, v, n=None): new_arr = arr[:] found = 0 - while found != n: - try: + try: + while found != n: new_arr.remove(v) found += 1 - except ValueError: - break + except ValueError: + pass return new_arr @@ -234,13 +236,13 @@ def py_arr_remove(arr, v, n=None): def py_arr_replace(arr, from_, to, n=None): new_arr = arr[:] found = 0 - while found != n: - try: + try: + while found != n: idx = new_arr.index(from_) new_arr[idx] = to found += 1 - except ValueError: - break + except ValueError: + pass return new_arr @@ -268,266 +270,266 @@ def py_flatten(arr): @pytest.mark.parametrize( ("stmt", "py_expr"), [ - [ + ( lambda col: f.array_append(col, literal(99.0)), lambda data: [np.append(arr, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_push_back(col, literal(99.0)), lambda data: [np.append(arr, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_append(col, literal(99.0)), lambda data: [np.append(arr, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_push_back(col, literal(99.0)), lambda data: [np.append(arr, 99.0) for arr in 
data], - ], - [ + ), + ( lambda col: f.array_concat(col, col), lambda data: [np.concatenate([arr, arr]) for arr in data], - ], - [ + ), + ( lambda col: f.array_cat(col, col), lambda data: [np.concatenate([arr, arr]) for arr in data], - ], - [ + ), + ( lambda col: f.list_cat(col, col), lambda data: [np.concatenate([arr, arr]) for arr in data], - ], - [ + ), + ( lambda col: f.list_concat(col, col), lambda data: [np.concatenate([arr, arr]) for arr in data], - ], - [ + ), + ( lambda col: f.array_dims(col), lambda data: [[len(r)] for r in data], - ], - [ + ), + ( lambda col: f.array_distinct(col), lambda data: [list(set(r)) for r in data], - ], - [ + ), + ( lambda col: f.list_distinct(col), lambda data: [list(set(r)) for r in data], - ], - [ + ), + ( lambda col: f.list_dims(col), lambda data: [[len(r)] for r in data], - ], - [ + ), + ( lambda col: f.array_element(col, literal(1)), lambda data: [r[0] for r in data], - ], - [ + ), + ( lambda col: f.array_empty(col), lambda data: [len(r) == 0 for r in data], - ], - [ + ), + ( lambda col: f.empty(col), lambda data: [len(r) == 0 for r in data], - ], - [ + ), + ( lambda col: f.array_extract(col, literal(1)), lambda data: [r[0] for r in data], - ], - [ + ), + ( lambda col: f.list_element(col, literal(1)), lambda data: [r[0] for r in data], - ], - [ + ), + ( lambda col: f.list_extract(col, literal(1)), lambda data: [r[0] for r in data], - ], - [ + ), + ( lambda col: f.array_length(col), lambda data: [len(r) for r in data], - ], - [ + ), + ( lambda col: f.list_length(col), lambda data: [len(r) for r in data], - ], - [ + ), + ( lambda col: f.array_has(col, literal(1.0)), lambda data: [1.0 in r for r in data], - ], - [ + ), + ( lambda col: f.array_has_all( col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]]) ), lambda data: [np.all([v in r for v in [1.0, 3.0, 5.0]]) for r in data], - ], - [ + ), + ( lambda col: f.array_has_any( col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]]) ), lambda data: [np.any([v in r for v in 
[1.0, 3.0, 5.0]]) for r in data], - ], - [ + ), + ( lambda col: f.array_position(col, literal(1.0)), lambda data: [py_indexof(r, 1.0) for r in data], - ], - [ + ), + ( lambda col: f.array_indexof(col, literal(1.0)), lambda data: [py_indexof(r, 1.0) for r in data], - ], - [ + ), + ( lambda col: f.list_position(col, literal(1.0)), lambda data: [py_indexof(r, 1.0) for r in data], - ], - [ + ), + ( lambda col: f.list_indexof(col, literal(1.0)), lambda data: [py_indexof(r, 1.0) for r in data], - ], - [ + ), + ( lambda col: f.array_positions(col, literal(1.0)), lambda data: [[i + 1 for i, _v in enumerate(r) if _v == 1.0] for r in data], - ], - [ + ), + ( lambda col: f.list_positions(col, literal(1.0)), lambda data: [[i + 1 for i, _v in enumerate(r) if _v == 1.0] for r in data], - ], - [ + ), + ( lambda col: f.array_ndims(col), lambda data: [np.array(r).ndim for r in data], - ], - [ + ), + ( lambda col: f.list_ndims(col), lambda data: [np.array(r).ndim for r in data], - ], - [ + ), + ( lambda col: f.array_prepend(literal(99.0), col), lambda data: [np.insert(arr, 0, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_push_front(literal(99.0), col), lambda data: [np.insert(arr, 0, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_prepend(literal(99.0), col), lambda data: [np.insert(arr, 0, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_push_front(literal(99.0), col), lambda data: [np.insert(arr, 0, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_pop_back(col), lambda data: [arr[:-1] for arr in data], - ], - [ + ), + ( lambda col: f.array_pop_front(col), lambda data: [arr[1:] for arr in data], - ], - [ + ), + ( lambda col: f.array_remove(col, literal(3.0)), lambda data: [py_arr_remove(arr, 3.0, 1) for arr in data], - ], - [ + ), + ( lambda col: f.list_remove(col, literal(3.0)), lambda data: [py_arr_remove(arr, 3.0, 1) for arr in data], - ], - [ + ), + ( lambda col: f.array_remove_n(col, literal(3.0), literal(2)), lambda data: 
[py_arr_remove(arr, 3.0, 2) for arr in data], - ], - [ + ), + ( lambda col: f.list_remove_n(col, literal(3.0), literal(2)), lambda data: [py_arr_remove(arr, 3.0, 2) for arr in data], - ], - [ + ), + ( lambda col: f.array_remove_all(col, literal(3.0)), lambda data: [py_arr_remove(arr, 3.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_remove_all(col, literal(3.0)), lambda data: [py_arr_remove(arr, 3.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_repeat(col, literal(2)), lambda data: [[arr] * 2 for arr in data], - ], - [ + ), + ( lambda col: f.list_repeat(col, literal(2)), lambda data: [[arr] * 2 for arr in data], - ], - [ + ), + ( lambda col: f.array_replace(col, literal(3.0), literal(4.0)), lambda data: [py_arr_replace(arr, 3.0, 4.0, 1) for arr in data], - ], - [ + ), + ( lambda col: f.list_replace(col, literal(3.0), literal(4.0)), lambda data: [py_arr_replace(arr, 3.0, 4.0, 1) for arr in data], - ], - [ + ), + ( lambda col: f.array_replace_n(col, literal(3.0), literal(4.0), literal(1)), lambda data: [py_arr_replace(arr, 3.0, 4.0, 1) for arr in data], - ], - [ + ), + ( lambda col: f.list_replace_n(col, literal(3.0), literal(4.0), literal(2)), lambda data: [py_arr_replace(arr, 3.0, 4.0, 2) for arr in data], - ], - [ + ), + ( lambda col: f.array_replace_all(col, literal(3.0), literal(4.0)), lambda data: [py_arr_replace(arr, 3.0, 4.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_replace_all(col, literal(3.0), literal(4.0)), lambda data: [py_arr_replace(arr, 3.0, 4.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_sort(col, descending=True, null_first=True), lambda data: [np.sort(arr)[::-1] for arr in data], - ], - [ + ), + ( lambda col: f.list_sort(col, descending=False, null_first=False), lambda data: [np.sort(arr) for arr in data], - ], - [ + ), + ( lambda col: f.array_slice(col, literal(2), literal(4)), lambda data: [arr[1:4] for arr in data], - ], + ), pytest.param( lambda col: f.list_slice(col, literal(-1), literal(2)), 
lambda data: [arr[-1:2] for arr in data], ), - [ + ( lambda col: f.array_intersect(col, literal([3.0, 4.0])), lambda data: [np.intersect1d(arr, [3.0, 4.0]) for arr in data], - ], - [ + ), + ( lambda col: f.list_intersect(col, literal([3.0, 4.0])), lambda data: [np.intersect1d(arr, [3.0, 4.0]) for arr in data], - ], - [ + ), + ( lambda col: f.array_union(col, literal([12.0, 999.0])), lambda data: [np.union1d(arr, [12.0, 999.0]) for arr in data], - ], - [ + ), + ( lambda col: f.list_union(col, literal([12.0, 999.0])), lambda data: [np.union1d(arr, [12.0, 999.0]) for arr in data], - ], - [ + ), + ( lambda col: f.array_except(col, literal([3.0])), lambda data: [np.setdiff1d(arr, [3.0]) for arr in data], - ], - [ + ), + ( lambda col: f.list_except(col, literal([3.0])), lambda data: [np.setdiff1d(arr, [3.0]) for arr in data], - ], - [ + ), + ( lambda col: f.array_resize(col, literal(10), literal(0.0)), lambda data: [py_arr_resize(arr, 10, 0.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_resize(col, literal(10), literal(0.0)), lambda data: [py_arr_resize(arr, 10, 0.0) for arr in data], - ], - [ + ), + ( lambda col: f.range(literal(1), literal(5), literal(2)), lambda data: [np.arange(1, 5, 2)], - ], + ), ], ) def test_array_functions(stmt, py_expr): @@ -611,22 +613,22 @@ def test_make_array_functions(make_func): @pytest.mark.parametrize( ("stmt", "py_expr"), [ - [ + ( f.array_to_string(column("arr"), literal(",")), lambda data: [",".join([str(int(v)) for v in r]) for r in data], - ], - [ + ), + ( f.array_join(column("arr"), literal(",")), lambda data: [",".join([str(int(v)) for v in r]) for r in data], - ], - [ + ), + ( f.list_to_string(column("arr"), literal(",")), lambda data: [",".join([str(int(v)) for v in r]) for r in data], - ], - [ + ), + ( f.list_join(column("arr"), literal(",")), lambda data: [",".join([str(int(v)) for v in r]) for r in data], - ], + ), ], ) def test_array_function_obj_tests(stmt, py_expr): @@ -640,7 +642,7 @@ def 
test_array_function_obj_tests(stmt, py_expr): @pytest.mark.parametrize( - "function, expected_result", + ("function", "expected_result"), [ ( f.ascii(column("a")), @@ -894,54 +896,72 @@ def test_temporal_functions(df): assert result.column(0) == pa.array([12, 6, 7], type=pa.int32()) assert result.column(1) == pa.array([2022, 2027, 2020], type=pa.int32()) assert result.column(2) == pa.array( - [datetime(2022, 12, 1), datetime(2027, 6, 1), datetime(2020, 7, 1)], - type=pa.timestamp("us"), + [ + datetime(2022, 12, 1, tzinfo=DEFAULT_TZ), + datetime(2027, 6, 1, tzinfo=DEFAULT_TZ), + datetime(2020, 7, 1, tzinfo=DEFAULT_TZ), + ], + type=pa.timestamp("ns", tz=DEFAULT_TZ), ) assert result.column(3) == pa.array( - [datetime(2022, 12, 31), datetime(2027, 6, 26), datetime(2020, 7, 2)], - type=pa.timestamp("us"), + [ + datetime(2022, 12, 31, tzinfo=DEFAULT_TZ), + datetime(2027, 6, 26, tzinfo=DEFAULT_TZ), + datetime(2020, 7, 2, tzinfo=DEFAULT_TZ), + ], + type=pa.timestamp("ns", tz=DEFAULT_TZ), ) assert result.column(4) == pa.array( [ - datetime(2022, 12, 30, 23, 47, 30), - datetime(2027, 6, 25, 23, 47, 30), - datetime(2020, 7, 1, 23, 47, 30), + datetime(2022, 12, 30, 23, 47, 30, tzinfo=DEFAULT_TZ), + datetime(2027, 6, 25, 23, 47, 30, tzinfo=DEFAULT_TZ), + datetime(2020, 7, 1, 23, 47, 30, tzinfo=DEFAULT_TZ), ], - type=pa.timestamp("ns"), + type=pa.timestamp("ns", tz=DEFAULT_TZ), ) assert result.column(5) == pa.array( - [datetime(2023, 1, 10, 20, 52, 54)] * 3, type=pa.timestamp("s") + [datetime(2023, 1, 10, 20, 52, 54, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("s"), ) assert result.column(6) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ns"), ) assert result.column(7) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14)] * 3, type=pa.timestamp("s") + [datetime(2023, 9, 7, 5, 6, 14, tzinfo=DEFAULT_TZ)] * 3, type=pa.timestamp("s") ) assert result.column(8) == 
pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523000)] * 3, type=pa.timestamp("ms") + [datetime(2023, 9, 7, 5, 6, 14, 523000, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ms"), ) assert result.column(9) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("us"), ) assert result.column(10) == pa.array([31, 26, 2], type=pa.int32()) assert result.column(11) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ns"), ) assert result.column(12) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14)] * 3, type=pa.timestamp("s") + [datetime(2023, 9, 7, 5, 6, 14, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("s"), ) assert result.column(13) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523000)] * 3, type=pa.timestamp("ms") + [datetime(2023, 9, 7, 5, 6, 14, 523000, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ms"), ) assert result.column(14) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("us"), ) assert result.column(15) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ns"), ) assert result.column(16) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ns"), ) @@ -1057,7 +1077,7 @@ def test_regr_funcs_sql_2(): @pytest.mark.parametrize( - "func, expected", + ("func", "expected"), [ pytest.param(f.regr_slope(column("c2"), column("c1")), [4.6], id="regr_slope"), pytest.param( @@ -1160,7 +1180,7 @@ def test_binary_string_functions(df): @pytest.mark.parametrize( - "python_datatype, name, 
expected", + ("python_datatype", "name", "expected"), [ pytest.param(bool, "e", pa.bool_(), id="bool"), pytest.param(int, "b", pa.int64(), id="int"), @@ -1179,7 +1199,7 @@ def test_cast(df, python_datatype, name: str, expected): @pytest.mark.parametrize( - "negated, low, high, expected", + ("negated", "low", "high", "expected"), [ pytest.param(False, 3, 5, {"filtered": [4, 5]}), pytest.param(False, 4, 5, {"filtered": [4, 5]}), diff --git a/python/tests/test_imports.py b/python/tests/test_imports.py index 0c155cbde..9ef7ed89a 100644 --- a/python/tests/test_imports.py +++ b/python/tests/test_imports.py @@ -169,14 +169,15 @@ def test_class_module_is_datafusion(): def test_import_from_functions_submodule(): - from datafusion.functions import abs, sin # noqa + from datafusion.functions import abs as df_abs + from datafusion.functions import sin - assert functions.abs is abs + assert functions.abs is df_abs assert functions.sin is sin msg = "cannot import name 'foobar' from 'datafusion.functions'" with pytest.raises(ImportError, match=msg): - from datafusion.functions import foobar # noqa + from datafusion.functions import foobar # noqa: F401 def test_classes_are_inheritable(): diff --git a/python/tests/test_input.py b/python/tests/test_input.py index 806471357..4663f6148 100644 --- a/python/tests/test_input.py +++ b/python/tests/test_input.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-import os +import pathlib from datafusion.input.location import LocationInputPlugin @@ -23,10 +23,10 @@ def test_location_input(): location_input = LocationInputPlugin() - cwd = os.getcwd() - input_file = cwd + "/testing/data/parquet/generated_simple_numerics/blogs.parquet" + cwd = pathlib.Path.cwd() + input_file = cwd / "testing/data/parquet/generated_simple_numerics/blogs.parquet" table_name = "blog" - tbl = location_input.build_table(input_file, table_name) - assert "blog" == tbl.name - assert 3 == len(tbl.columns) + tbl = location_input.build_table(str(input_file), table_name) + assert tbl.name == "blog" + assert len(tbl.columns) == 3 assert "blogs.parquet" in tbl.filepaths[0] diff --git a/python/tests/test_io.py b/python/tests/test_io.py index 21ad188ee..7ca509689 100644 --- a/python/tests/test_io.py +++ b/python/tests/test_io.py @@ -14,8 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-import os -import pathlib +from pathlib import Path import pyarrow as pa from datafusion import column @@ -23,10 +22,10 @@ def test_read_json_global_ctx(ctx): - path = os.path.dirname(os.path.abspath(__file__)) + path = Path(__file__).parent.resolve() # Default - test_data_path = os.path.join(path, "data_test_context", "data.json") + test_data_path = Path(path) / "data_test_context" / "data.json" df = read_json(test_data_path) result = df.collect() @@ -46,7 +45,7 @@ def test_read_json_global_ctx(ctx): assert result[0].schema == schema # File extension - test_data_path = os.path.join(path, "data_test_context", "data.json") + test_data_path = Path(path) / "data_test_context" / "data.json" df = read_json(test_data_path, file_extension=".json") result = df.collect() @@ -59,7 +58,7 @@ def test_read_parquet_global(): parquet_df.show() assert parquet_df is not None - path = pathlib.Path.cwd() / "parquet/data/alltypes_plain.parquet" + path = Path.cwd() / "parquet/data/alltypes_plain.parquet" parquet_df = read_parquet(path=path) assert parquet_df is not None @@ -90,6 +89,6 @@ def test_read_avro(): avro_df.show() assert avro_df is not None - path = pathlib.Path.cwd() / "testing/data/avro/alltypes_plain.avro" + path = Path.cwd() / "testing/data/avro/alltypes_plain.avro" avro_df = read_avro(path=path) assert avro_df is not None diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py index 862f745bf..b6348e3a0 100644 --- a/python/tests/test_sql.py +++ b/python/tests/test_sql.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
import gzip -import os +from pathlib import Path import numpy as np import pyarrow as pa @@ -47,9 +47,8 @@ def test_register_csv(ctx, tmp_path): ) write_csv(table, path) - with open(path, "rb") as csv_file: - with gzip.open(gzip_path, "wb") as gzipped_file: - gzipped_file.writelines(csv_file) + with Path.open(path, "rb") as csv_file, gzip.open(gzip_path, "wb") as gzipped_file: + gzipped_file.writelines(csv_file) ctx.register_csv("csv", path) ctx.register_csv("csv1", str(path)) @@ -158,7 +157,7 @@ def test_register_parquet(ctx, tmp_path): assert result.to_pydict() == {"cnt": [100]} -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_register_parquet_partitioned(ctx, tmp_path, path_to_str): dir_root = tmp_path / "dataset_parquet_partitioned" dir_root.mkdir(exist_ok=False) @@ -194,7 +193,7 @@ def test_register_parquet_partitioned(ctx, tmp_path, path_to_str): assert dict(zip(rd["grp"], rd["cnt"])) == {"a": 3, "b": 1} -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_register_dataset(ctx, tmp_path, path_to_str): path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) path = str(path) if path_to_str else path @@ -209,13 +208,15 @@ def test_register_dataset(ctx, tmp_path, path_to_str): def test_register_json(ctx, tmp_path): - path = os.path.dirname(os.path.abspath(__file__)) - test_data_path = os.path.join(path, "data_test_context", "data.json") + path = Path(__file__).parent.resolve() + test_data_path = Path(path) / "data_test_context" / "data.json" gzip_path = tmp_path / "data.json.gz" - with open(test_data_path, "rb") as json_file: - with gzip.open(gzip_path, "wb") as gzipped_file: - gzipped_file.writelines(json_file) + with ( + Path.open(test_data_path, "rb") as json_file, + gzip.open(gzip_path, "wb") as gzipped_file, + ): + gzipped_file.writelines(json_file) ctx.register_json("json", test_data_path) 
ctx.register_json("json1", str(test_data_path)) @@ -470,16 +471,18 @@ def test_simple_select(ctx, tmp_path, arr): # In DF 43.0.0 we now default to having BinaryView and StringView # so the array that is saved to the parquet is slightly different # than the array read. Convert to values for comparison. - if isinstance(result, pa.BinaryViewArray) or isinstance(result, pa.StringViewArray): + if isinstance(result, (pa.BinaryViewArray, pa.StringViewArray)): arr = arr.tolist() result = result.tolist() np.testing.assert_equal(result, arr) -@pytest.mark.parametrize("file_sort_order", (None, [[col("int").sort(True, True)]])) -@pytest.mark.parametrize("pass_schema", (True, False)) -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize( + "file_sort_order", [None, [[col("int").sort(ascending=True, nulls_first=True)]]] +) +@pytest.mark.parametrize("pass_schema", [True, False]) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_register_listing_table( ctx, tmp_path, pass_schema, file_sort_order, path_to_str ): @@ -528,7 +531,7 @@ def test_register_listing_table( assert dict(zip(rd["grp"], rd["count"])) == {"a": 5, "b": 2} result = ctx.sql( - "SELECT grp, COUNT(*) AS count FROM my_table WHERE date_id=20201005 GROUP BY grp" + "SELECT grp, COUNT(*) AS count FROM my_table WHERE date_id=20201005 GROUP BY grp" # noqa: E501 ).collect() result = pa.Table.from_batches(result) diff --git a/python/tests/test_store.py b/python/tests/test_store.py index 53ffc3acf..ac9af98f3 100644 --- a/python/tests/test_store.py +++ b/python/tests/test_store.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-import os +from pathlib import Path import pytest from datafusion import SessionContext @@ -23,17 +23,16 @@ @pytest.fixture def ctx(): - ctx = SessionContext() - return ctx + return SessionContext() def test_read_parquet(ctx): ctx.register_parquet( "test", - f"file://{os.getcwd()}/parquet/data/alltypes_plain.parquet", - [], - True, - ".parquet", + f"file://{Path.cwd()}/parquet/data/alltypes_plain.parquet", + table_partition_cols=[], + parquet_pruning=True, + file_extension=".parquet", ) df = ctx.sql("SELECT * FROM test") assert isinstance(df.collect(), list) diff --git a/python/tests/test_substrait.py b/python/tests/test_substrait.py index feada7cde..f367a447d 100644 --- a/python/tests/test_substrait.py +++ b/python/tests/test_substrait.py @@ -50,7 +50,7 @@ def test_substrait_serialization(ctx): substrait_plan = ss.Producer.to_substrait_plan(df.logical_plan(), ctx) -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_substrait_file_serialization(ctx, tmp_path, path_to_str): batch = pa.RecordBatch.from_arrays( [pa.array([1, 2, 3]), pa.array([4, 5, 6])], diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py index 97cf81f3c..453ff6f4f 100644 --- a/python/tests/test_udaf.py +++ b/python/tests/test_udaf.py @@ -17,8 +17,6 @@ from __future__ import annotations -from typing import List - import pyarrow as pa import pyarrow.compute as pc import pytest @@ -31,7 +29,7 @@ class Summarize(Accumulator): def __init__(self, initial_value: float = 0.0): self._sum = pa.scalar(initial_value) - def state(self) -> List[pa.Scalar]: + def state(self) -> list[pa.Scalar]: return [self._sum] def update(self, values: pa.Array) -> None: @@ -39,7 +37,7 @@ def update(self, values: pa.Array) -> None: # This breaks on `None` self._sum = pa.scalar(self._sum.as_py() + pc.sum(values).as_py()) - def merge(self, states: List[pa.Array]) -> None: + def merge(self, states: list[pa.Array]) -> None: # Not nice since pyarrow 
scalars can't be summed yet. # This breaks on `None` self._sum = pa.scalar(self._sum.as_py() + pc.sum(states[0]).as_py()) @@ -56,7 +54,7 @@ class MissingMethods(Accumulator): def __init__(self): self._sum = pa.scalar(0) - def state(self) -> List[pa.Scalar]: + def state(self) -> list[pa.Scalar]: return [self._sum] @@ -86,7 +84,7 @@ def test_errors(df): "evaluate, merge, update)" ) with pytest.raises(Exception, match=msg): - accum = udaf( # noqa F841 + accum = udaf( # noqa: F841 MissingMethods, pa.int64(), pa.int64(), diff --git a/python/tests/test_udwf.py b/python/tests/test_udwf.py index 2fea34aa3..3d6dcf9d8 100644 --- a/python/tests/test_udwf.py +++ b/python/tests/test_udwf.py @@ -298,7 +298,7 @@ def test_udwf_errors(df): ] -@pytest.mark.parametrize("name,expr,expected", data_test_udwf_functions) +@pytest.mark.parametrize(("name", "expr", "expected"), data_test_udwf_functions) def test_udwf_functions(df, name, expr, expected): df = df.select("a", "b", f.round(expr, lit(3)).alias(name)) diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py index ac064ba95..d7f6f6e35 100644 --- a/python/tests/test_wrapper_coverage.py +++ b/python/tests/test_wrapper_coverage.py @@ -19,6 +19,7 @@ import datafusion.functions import datafusion.object_store import datafusion.substrait +import pytest # EnumType introduced in 3.11. 3.10 and prior it was called EnumMeta. 
try: @@ -41,10 +42,8 @@ def missing_exports(internal_obj, wrapped_obj) -> None: internal_attr = getattr(internal_obj, attr) wrapped_attr = getattr(wrapped_obj, attr) - if internal_attr is not None: - if wrapped_attr is None: - print("Missing attribute: ", attr) - assert False + if internal_attr is not None and wrapped_attr is None: + pytest.fail(f"Missing attribute: {attr}") if attr in ["__self__", "__class__"]: continue From 3dcf7c7e5c0af0eb3c5e3bdf9c6e33fd4541b070 Mon Sep 17 00:00:00 2001 From: jsai28 <54253219+jsai28@users.noreply.github.com> Date: Thu, 13 Mar 2025 04:09:03 -0600 Subject: [PATCH 110/248] feat/making global context accessible for users (#1060) * Rename _global_ctx to global_ctx * Add global context to python wrapper code * Update context.py * singleton for global context * formatting * remove udf from import * remove _global_instance * formatting * formatting * unnecessary test * fix test_io.py * ran ruff * ran ruff format --- python/datafusion/context.py | 12 +++++++ python/datafusion/io.py | 63 ++++++++++++++++-------------------- python/tests/test_context.py | 18 +++++++++++ src/context.rs | 2 +- 4 files changed, 58 insertions(+), 37 deletions(-) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 0ab1a908a..58ad9a943 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -496,6 +496,18 @@ def __init__( self.ctx = SessionContextInternal(config, runtime) + @classmethod + def global_ctx(cls) -> SessionContext: + """Retrieve the global context as a `SessionContext` wrapper. + + Returns: + A `SessionContext` object that wraps the global `SessionContextInternal`. + """ + internal_ctx = SessionContextInternal.global_ctx() + wrapper = cls() + wrapper.ctx = internal_ctx + return wrapper + def enable_url_table(self) -> SessionContext: """Control if local files can be queried as tables. 
diff --git a/python/datafusion/io.py b/python/datafusion/io.py index 3e39703e3..ef5ebf96f 100644 --- a/python/datafusion/io.py +++ b/python/datafusion/io.py @@ -21,10 +21,9 @@ from typing import TYPE_CHECKING +from datafusion.context import SessionContext from datafusion.dataframe import DataFrame -from ._internal import SessionContext as SessionContextInternal - if TYPE_CHECKING: import pathlib @@ -68,16 +67,14 @@ def read_parquet( """ if table_partition_cols is None: table_partition_cols = [] - return DataFrame( - SessionContextInternal._global_ctx().read_parquet( - str(path), - table_partition_cols, - parquet_pruning, - file_extension, - skip_metadata, - schema, - file_sort_order, - ) + return SessionContext.global_ctx().read_parquet( + str(path), + table_partition_cols, + parquet_pruning, + file_extension, + skip_metadata, + schema, + file_sort_order, ) @@ -110,15 +107,13 @@ def read_json( """ if table_partition_cols is None: table_partition_cols = [] - return DataFrame( - SessionContextInternal._global_ctx().read_json( - str(path), - schema, - schema_infer_max_records, - file_extension, - table_partition_cols, - file_compression_type, - ) + return SessionContext.global_ctx().read_json( + str(path), + schema, + schema_infer_max_records, + file_extension, + table_partition_cols, + file_compression_type, ) @@ -161,17 +156,15 @@ def read_csv( path = [str(p) for p in path] if isinstance(path, list) else str(path) - return DataFrame( - SessionContextInternal._global_ctx().read_csv( - path, - schema, - has_header, - delimiter, - schema_infer_max_records, - file_extension, - table_partition_cols, - file_compression_type, - ) + return SessionContext.global_ctx().read_csv( + path, + schema, + has_header, + delimiter, + schema_infer_max_records, + file_extension, + table_partition_cols, + file_compression_type, ) @@ -198,8 +191,6 @@ def read_avro( """ if file_partition_cols is None: file_partition_cols = [] - return DataFrame( - 
SessionContextInternal._global_ctx().read_avro( - str(path), schema, file_partition_cols, file_extension - ) + return SessionContext.global_ctx().read_avro( + str(path), schema, file_partition_cols, file_extension ) diff --git a/python/tests/test_context.py b/python/tests/test_context.py index 7a0a7aa08..4a15ac9cf 100644 --- a/python/tests/test_context.py +++ b/python/tests/test_context.py @@ -632,3 +632,21 @@ def test_sql_with_options_no_statements(ctx): options = SQLOptions().with_allow_statements(allow=False) with pytest.raises(Exception, match="SetVariable"): ctx.sql_with_options(sql, options=options) + + +@pytest.fixture +def batch(): + return pa.RecordBatch.from_arrays( + [pa.array([4, 5, 6])], + names=["a"], + ) + + +def test_create_dataframe_with_global_ctx(batch): + ctx = SessionContext.global_ctx() + + df = ctx.create_dataframe([[batch]]) + + result = df.collect()[0].column(0) + + assert result == pa.array([4, 5, 6]) diff --git a/src/context.rs b/src/context.rs index 9ba87eb8a..0db0f4d7e 100644 --- a/src/context.rs +++ b/src/context.rs @@ -308,7 +308,7 @@ impl PySessionContext { #[classmethod] #[pyo3(signature = ())] - fn _global_ctx(_cls: &Bound<'_, PyType>) -> PyResult { + fn global_ctx(_cls: &Bound<'_, PyType>) -> PyResult { Ok(Self { ctx: get_global_ctx().clone(), }) From 55141bad7c2270c14742e962d8bab1d4f1be27f5 Mon Sep 17 00:00:00 2001 From: Spaarsh <67336892+Spaarsh@users.noreply.github.com> Date: Fri, 14 Mar 2025 18:26:31 +0530 Subject: [PATCH 111/248] Renaming Internal Structs (#1059) * Renamed Expr to RawExpr * Fixed CI test for exported classes to include RawExpr as well * Fixed CI test for exported classes to check if Expr class covers RawExpr * Generalized Raw* class checking * fixes * fixes * fixed the CI test to not look for Raw classes in the datafusion module * Add additional text to unit test describing operation and ensure wrapped Raw classes are checked * New ruff rule on main * Resolve ruff errors --------- Co-authored-by: Tim Saucer 
--- python/datafusion/expr.py | 8 ++-- python/tests/test_wrapper_coverage.py | 55 +++++++++++++++++++-------- src/expr.rs | 2 +- 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 702f75aed..77b6c272d 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -193,7 +193,7 @@ class Expr: :ref:`Expressions` in the online documentation for more information. """ - def __init__(self, expr: expr_internal.Expr) -> None: + def __init__(self, expr: expr_internal.RawExpr) -> None: """This constructor should not be called by the end user.""" self.expr = expr @@ -383,7 +383,7 @@ def literal(value: Any) -> Expr: value = pa.scalar(value, type=pa.string_view()) if not isinstance(value, pa.Scalar): value = pa.scalar(value) - return Expr(expr_internal.Expr.literal(value)) + return Expr(expr_internal.RawExpr.literal(value)) @staticmethod def string_literal(value: str) -> Expr: @@ -398,13 +398,13 @@ def string_literal(value: str) -> Expr: """ if isinstance(value, str): value = pa.scalar(value, type=pa.string()) - return Expr(expr_internal.Expr.literal(value)) + return Expr(expr_internal.RawExpr.literal(value)) return Expr.literal(value) @staticmethod def column(value: str) -> Expr: """Creates a new expression representing a column.""" - return Expr(expr_internal.Expr.column(value)) + return Expr(expr_internal.RawExpr.column(value)) def alias(self, name: str) -> Expr: """Assign a name to the expression.""" diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py index d7f6f6e35..a2de2d32b 100644 --- a/python/tests/test_wrapper_coverage.py +++ b/python/tests/test_wrapper_coverage.py @@ -28,37 +28,62 @@ from enum import EnumMeta as EnumType -def missing_exports(internal_obj, wrapped_obj) -> None: - # Special case enums - just make sure they exist since dir() - # and other functions get overridden. 
+def missing_exports(internal_obj, wrapped_obj) -> None: # noqa: C901 + """ + Identify if any of the rust exposted structs or functions do not have wrappers. + + Special handling for: + - Raw* classes: Internal implementation details that shouldn't be exposed + - _global_ctx: Internal implementation detail + - __self__, __class__: Python special attributes + """ + # Special case enums - EnumType overrides a some of the internal functions, + # so check all of the values exist and move on if isinstance(wrapped_obj, EnumType): + expected_values = [v for v in dir(internal_obj) if not v.startswith("__")] + for value in expected_values: + assert value in dir(wrapped_obj) return - for attr in dir(internal_obj): - if attr in ["_global_ctx"]: - continue - assert attr in dir(wrapped_obj) + for internal_attr_name in dir(internal_obj): + wrapped_attr_name = internal_attr_name.removeprefix("Raw") + assert wrapped_attr_name in dir(wrapped_obj) - internal_attr = getattr(internal_obj, attr) - wrapped_attr = getattr(wrapped_obj, attr) + internal_attr = getattr(internal_obj, internal_attr_name) + wrapped_attr = getattr(wrapped_obj, wrapped_attr_name) - if internal_attr is not None and wrapped_attr is None: - pytest.fail(f"Missing attribute: {attr}") + # There are some auto generated attributes that can be None, such as + # __kwdefaults__ and __doc__. As long as these are None on the internal + # object, it's okay to skip them. However if they do exist on the internal + # object they must also exist on the wrapped object. 
+ if internal_attr is not None: + if wrapped_attr is None: + pytest.fail(f"Missing attribute: {internal_attr_name}") - if attr in ["__self__", "__class__"]: + if internal_attr_name in ["__self__", "__class__"]: continue + if isinstance(internal_attr, list): assert isinstance(wrapped_attr, list) + + # We have cases like __all__ that are a list and we want to be certain that + # every value in the list in the internal object is also in the wrapper list for val in internal_attr: - assert val in wrapped_attr + if isinstance(val, str) and val.startswith("Raw"): + assert val[3:] in wrapped_attr + else: + assert val in wrapped_attr elif hasattr(internal_attr, "__dict__"): + # Check all submodules recursively missing_exports(internal_attr, wrapped_attr) def test_datafusion_missing_exports() -> None: """Check for any missing python exports. - This test verifies that every exposed class, attribute, and function in - the internal (pyo3) module is also exposed in our python wrappers. + This test verifies that every exposed class, attribute, + and function in the internal (pyo3) module - datafusion._internal + is also exposed in our python wrappers - datafusion - + i.e., the ones exposed to the public. 
""" missing_exports(datafusion._internal, datafusion) diff --git a/src/expr.rs b/src/expr.rs index e750be6a4..d3c528eb4 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -101,7 +101,7 @@ pub mod window; use sort_expr::{to_sort_expressions, PySortExpr}; /// A PyExpr that can be used on a DataFrame -#[pyclass(name = "Expr", module = "datafusion.expr", subclass)] +#[pyclass(name = "RawExpr", module = "datafusion.expr", subclass)] #[derive(Debug, Clone)] pub struct PyExpr { pub expr: Expr, From 4f457030f171a26d0c4cce4d55cf541519956fcc Mon Sep 17 00:00:00 2001 From: jsai28 <54253219+jsai28@users.noreply.github.com> Date: Sat, 15 Mar 2025 04:57:38 -0600 Subject: [PATCH 112/248] added pytest asyncio tests (#1063) --- pyproject.toml | 1 + python/tests/test_dataframe.py | 54 ++++++++++++++++++++++++++++++++++ uv.lock | 17 ++++++++++- 3 files changed, 71 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 060e3b80a..a4ed18c4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -150,6 +150,7 @@ dev = [ "maturin>=1.8.1", "numpy>1.25.0", "pytest>=7.4.4", + "pytest-asyncio>=0.23.3", "ruff>=0.9.1", "toml>=0.10.2", "pygithub==2.5.0", diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index d084f12dd..384b17878 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -771,6 +771,16 @@ def test_execution_plan(aggregate_df): assert rows_returned == 5 +@pytest.mark.asyncio +async def test_async_iteration_of_df(aggregate_df): + rows_returned = 0 + async for batch in aggregate_df.execute_stream(): + assert batch is not None + rows_returned += len(batch.to_pyarrow()[0]) + + assert rows_returned == 5 + + def test_repartition(df): df.repartition(2) @@ -958,6 +968,18 @@ def test_execute_stream(df): assert not list(stream) # after one iteration the generator must be exhausted +@pytest.mark.asyncio +async def test_execute_stream_async(df): + stream = df.execute_stream() + batches = [batch async for batch in stream] + 
+ assert all(batch is not None for batch in batches) + + # After consuming all batches, the stream should be exhausted + remaining_batches = [batch async for batch in stream] + assert not remaining_batches + + @pytest.mark.parametrize("schema", [True, False]) def test_execute_stream_to_arrow_table(df, schema): stream = df.execute_stream() @@ -974,6 +996,25 @@ def test_execute_stream_to_arrow_table(df, schema): assert set(pyarrow_table.column_names) == {"a", "b", "c"} +@pytest.mark.asyncio +@pytest.mark.parametrize("schema", [True, False]) +async def test_execute_stream_to_arrow_table_async(df, schema): + stream = df.execute_stream() + + if schema: + pyarrow_table = pa.Table.from_batches( + [batch.to_pyarrow() async for batch in stream], schema=df.schema() + ) + else: + pyarrow_table = pa.Table.from_batches( + [batch.to_pyarrow() async for batch in stream] + ) + + assert isinstance(pyarrow_table, pa.Table) + assert pyarrow_table.shape == (3, 3) + assert set(pyarrow_table.column_names) == {"a", "b", "c"} + + def test_execute_stream_partitioned(df): streams = df.execute_stream_partitioned() assert all(batch is not None for stream in streams for batch in stream) @@ -982,6 +1023,19 @@ def test_execute_stream_partitioned(df): ) # after one iteration all generators must be exhausted +@pytest.mark.asyncio +async def test_execute_stream_partitioned_async(df): + streams = df.execute_stream_partitioned() + + for stream in streams: + batches = [batch async for batch in stream] + assert all(batch is not None for batch in batches) + + # Ensure the stream is exhausted after iteration + remaining_batches = [batch async for batch in stream] + assert not remaining_batches + + def test_empty_to_arrow_table(df): # Convert empty datafusion dataframe to pyarrow Table pyarrow_table = df.limit(0).to_arrow_table() diff --git a/uv.lock b/uv.lock index 619b92856..7e4bc4c6b 100644 --- a/uv.lock +++ b/uv.lock @@ -284,9 +284,11 @@ dependencies = [ [package.dev-dependencies] dev = [ { name = 
"maturin" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "pygithub" }, { name = "pytest" }, + { name = "pytest-asyncio" }, { name = "ruff" }, { name = "toml" }, ] @@ -314,9 +316,10 @@ requires-dist = [ [package.metadata.requires-dev] dev = [ { name = "maturin", specifier = ">=1.8.1" }, - { name = "numpy", marker = "python_full_version >= '3.10'", specifier = ">1.24.4" }, + { name = "numpy", specifier = ">1.25.0" }, { name = "pygithub", specifier = "==2.5.0" }, { name = "pytest", specifier = ">=7.4.4" }, + { name = "pytest-asyncio", specifier = ">=0.23.3" }, { name = "ruff", specifier = ">=0.9.1" }, { name = "toml", specifier = ">=0.10.2" }, ] @@ -1079,6 +1082,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/11/92/76a1c94d3afee238333bc0a42b82935dd8f9cf8ce9e336ff87ee14d9e1cf/pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6", size = 343083 }, ] +[[package]] +name = "pytest-asyncio" +version = "0.25.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f2/a8/ecbc8ede70921dd2f544ab1cadd3ff3bf842af27f87bbdea774c7baa1d38/pytest_asyncio-0.25.3.tar.gz", hash = "sha256:fc1da2cf9f125ada7e710b4ddad05518d4cee187ae9412e9ac9271003497f07a", size = 54239 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/67/17/3493c5624e48fd97156ebaec380dcaafee9506d7e2c46218ceebbb57d7de/pytest_asyncio-0.25.3-py3-none-any.whl", hash = "sha256:9e89518e0f9bd08928f97a3482fdc4e244df17529460bc038291ccaf8f85c7c3", size = 19467 }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" From 2f52688d76e84794343c17ffaf3002534ecfd716 Mon Sep 17 00:00:00 2001 From: kosiew Date: Sat, 
15 Mar 2025 19:00:50 +0800 Subject: [PATCH 113/248] Add decorator for udwf (#1061) * feat: Introduce create_udwf method for User-Defined Window Functions - Added `create_udwf` static method to `WindowUDF` class, allowing users to create User-Defined Window Functions (UDWF) as both a function and a decorator. - Updated type hinting for `_R` using `TypeAlias` for better clarity. - Enhanced documentation with usage examples for both function and decorator styles, improving usability and understanding. * refactor: Simplify UDWF test suite and introduce SimpleWindowCount evaluator - Removed multiple exponential smoothing classes to streamline the code. - Introduced SimpleWindowCount class for basic row counting functionality. - Updated test cases to validate the new SimpleWindowCount evaluator. - Refactored fixture and test functions for clarity and consistency. - Enhanced error handling in UDWF creation tests. * fix: Update type alias import to use typing_extensions for compatibility * Add udwf tests for multiple input types and decorator syntax * replace old def udwf * refactor: Simplify df fixture by passing ctx as an argument * refactor: Rename DataFrame fixtures and update test functions - Renamed `df` fixture to `complex_window_df` for clarity. - Renamed `simple_df` fixture to `count_window_df` to better reflect its purpose. - Updated test functions to use the new fixture names, enhancing readability and maintainability. * refactor: Update udwf calls in WindowUDF to use BiasedNumbers directly - Changed udwf1 to use BiasedNumbers instead of bias_10. - Added udwf2 to call udwf with bias_10. - Introduced udwf3 to demonstrate a lambda function returning BiasedNumbers(20). 
* feat: Add overloads for udwf function to support multiple input types and decorator syntax * refactor: Simplify udwf method signature by removing redundant type hints * refactor: Remove state_type from udwf method signature and update return type handling - Eliminated the state_type parameter from the udwf method to simplify the function signature. - Updated return type handling in the _function and _decorator methods to use a generic type _R for better type flexibility. - Enhanced the decorator to wrap the original function, allowing for improved argument handling and expression return. * refactor: Update volatility parameter type in udwf method signature to support Volatility enum * Fix ruff errors * fix C901 for def udwf * refactor: Update udwf method signature and simplify input handling - Changed the type hint for the return type in the _create_window_udf_decorator method to use pa.DataType directly instead of a TypeVar. - Simplified the handling of input types by removing redundant checks and directly using the input types list. - Removed unnecessary comments and cleaned up the code for better readability. - Updated the test for udwf to use parameterized tests for better coverage and maintainability. * refactor: Rename input_type to input_types in udwf method signature for clarity * refactor: Enhance typing in udf.py by introducing Protocol for WindowEvaluator and improving import organization * Revert "refactor: Enhance typing in udf.py by introducing Protocol for WindowEvaluator and improving import organization" This reverts commit 16dbe5f3fd88f42d0a304384b162009bd9e49a35. 
--- python/datafusion/udf.py | 123 +++++++++++++++++++++------ python/tests/test_udwf.py | 170 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 264 insertions(+), 29 deletions(-) diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index 603b7063d..e93a34ca5 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -621,6 +621,16 @@ def __call__(self, *args: Expr) -> Expr: args_raw = [arg.expr for arg in args] return Expr(self._udwf.__call__(*args_raw)) + @overload + @staticmethod + def udwf( + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> Callable[..., WindowUDF]: ... + + @overload @staticmethod def udwf( func: Callable[[], WindowEvaluator], @@ -628,24 +638,31 @@ def udwf( return_type: pa.DataType, volatility: Volatility | str, name: Optional[str] = None, - ) -> WindowUDF: - """Create a new User-Defined Window Function. + ) -> WindowUDF: ... - If your :py:class:`WindowEvaluator` can be instantiated with no arguments, you - can simply pass it's type as ``func``. If you need to pass additional arguments - to it's constructor, you can define a lambda or a factory method. During runtime - the :py:class:`WindowEvaluator` will be constructed for every instance in - which this UDWF is used. The following examples are all valid. + @staticmethod + def udwf(*args: Any, **kwargs: Any): # noqa: D417 + """Create a new User-Defined Window Function (UDWF). - .. code-block:: python + This class can be used both as a **function** and as a **decorator**. + + Usage: + - **As a function**: Call `udwf(func, input_types, return_type, volatility, + name)`. + - **As a decorator**: Use `@udwf(input_types, return_type, volatility, + name)`. When using `udwf` as a decorator, **do not pass `func` + explicitly**. 
+ **Function example:** + ``` import pyarrow as pa class BiasedNumbers(WindowEvaluator): def __init__(self, start: int = 0) -> None: self.start = start - def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: + def evaluate_all(self, values: list[pa.Array], + num_rows: int) -> pa.Array: return pa.array([self.start + i for i in range(num_rows)]) def bias_10() -> BiasedNumbers: @@ -655,35 +672,93 @@ def bias_10() -> BiasedNumbers: udwf2 = udwf(bias_10, pa.int64(), pa.int64(), "immutable") udwf3 = udwf(lambda: BiasedNumbers(20), pa.int64(), pa.int64(), "immutable") + ``` + + **Decorator example:** + ``` + @udwf(pa.int64(), pa.int64(), "immutable") + def biased_numbers() -> BiasedNumbers: + return BiasedNumbers(10) + ``` + Args: - func: A callable to create the window function. - input_types: The data types of the arguments to ``func``. + func: **Only needed when calling as a function. Skip this argument when + using `udwf` as a decorator.** + input_types: The data types of the arguments. return_type: The data type of the return value. volatility: See :py:class:`Volatility` for allowed values. - arguments: A list of arguments to pass in to the __init__ method for accum. name: A descriptive name for the function. Returns: - A user-defined window function. - """ # noqa: W505, E501 + A user-defined window function that can be used in window function calls. 
+ """ + if args and callable(args[0]): + # Case 1: Used as a function, require the first parameter to be callable + return WindowUDF._create_window_udf(*args, **kwargs) + # Case 2: Used as a decorator with parameters + return WindowUDF._create_window_udf_decorator(*args, **kwargs) + + @staticmethod + def _create_window_udf( + func: Callable[[], WindowEvaluator], + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> WindowUDF: + """Create a WindowUDF instance from function arguments.""" if not callable(func): msg = "`func` must be callable." raise TypeError(msg) if not isinstance(func(), WindowEvaluator): msg = "`func` must implement the abstract base class WindowEvaluator" raise TypeError(msg) - if name is None: - name = func().__class__.__qualname__.lower() - if isinstance(input_types, pa.DataType): - input_types = [input_types] - return WindowUDF( - name=name, - func=func, - input_types=input_types, - return_type=return_type, - volatility=volatility, + + name = name or func.__qualname__.lower() + input_types = ( + [input_types] if isinstance(input_types, pa.DataType) else input_types ) + return WindowUDF(name, func, input_types, return_type, volatility) + + @staticmethod + def _get_default_name(func: Callable) -> str: + """Get the default name for a function based on its attributes.""" + if hasattr(func, "__qualname__"): + return func.__qualname__.lower() + return func.__class__.__name__.lower() + + @staticmethod + def _normalize_input_types( + input_types: pa.DataType | list[pa.DataType], + ) -> list[pa.DataType]: + """Convert a single DataType to a list if needed.""" + if isinstance(input_types, pa.DataType): + return [input_types] + return input_types + + @staticmethod + def _create_window_udf_decorator( + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> Callable[[Callable[[], 
WindowEvaluator]], Callable[..., Expr]]: + """Create a decorator for a WindowUDF.""" + + def decorator(func: Callable[[], WindowEvaluator]) -> Callable[..., Expr]: + udwf_caller = WindowUDF._create_window_udf( + func, input_types, return_type, volatility, name + ) + + @functools.wraps(func) + def wrapper(*args: Any, **kwargs: Any) -> Expr: + return udwf_caller(*args, **kwargs) + + return wrapper + + return decorator + # Convenience exports so we can import instead of treating as # variables at the package root diff --git a/python/tests/test_udwf.py b/python/tests/test_udwf.py index 3d6dcf9d8..4190e7d64 100644 --- a/python/tests/test_udwf.py +++ b/python/tests/test_udwf.py @@ -162,14 +162,27 @@ def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: return pa.array(results) +class SimpleWindowCount(WindowEvaluator): + """A simple window evaluator that counts rows.""" + + def __init__(self, base: int = 0) -> None: + self.base = base + + def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: + return pa.array([self.base + i for i in range(num_rows)]) + + class NotSubclassOfWindowEvaluator: pass @pytest.fixture -def df(): - ctx = SessionContext() +def ctx(): + return SessionContext() + +@pytest.fixture +def complex_window_df(ctx): # create a RecordBatch and a new DataFrame from it batch = pa.RecordBatch.from_arrays( [ @@ -182,7 +195,17 @@ def df(): return ctx.create_dataframe([[batch]]) -def test_udwf_errors(df): +@pytest.fixture +def count_window_df(ctx): + # create a RecordBatch and a new DataFrame from it + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 4, 6])], + names=["a", "b"], + ) + return ctx.create_dataframe([[batch]], name="test_table") + + +def test_udwf_errors(complex_window_df): with pytest.raises(TypeError): udwf( NotSubclassOfWindowEvaluator, @@ -192,6 +215,103 @@ def test_udwf_errors(df): ) +def test_udwf_errors_with_message(): + """Test error cases for UDWF creation.""" + with 
pytest.raises( + TypeError, match="`func` must implement the abstract base class WindowEvaluator" + ): + udwf( + NotSubclassOfWindowEvaluator, pa.int64(), pa.int64(), volatility="immutable" + ) + + +def test_udwf_basic_usage(count_window_df): + """Test basic UDWF usage with a simple counting window function.""" + simple_count = udwf( + SimpleWindowCount, pa.int64(), pa.int64(), volatility="immutable" + ) + + df = count_window_df.select( + simple_count(column("a")) + .window_frame(WindowFrame("rows", None, None)) + .build() + .alias("count") + ) + result = df.collect()[0] + assert result.column(0) == pa.array([0, 1, 2]) + + +def test_udwf_with_args(count_window_df): + """Test UDWF with constructor arguments.""" + count_base10 = udwf( + lambda: SimpleWindowCount(10), pa.int64(), pa.int64(), volatility="immutable" + ) + + df = count_window_df.select( + count_base10(column("a")) + .window_frame(WindowFrame("rows", None, None)) + .build() + .alias("count") + ) + result = df.collect()[0] + assert result.column(0) == pa.array([10, 11, 12]) + + +def test_udwf_decorator_basic(count_window_df): + """Test UDWF used as a decorator.""" + + @udwf([pa.int64()], pa.int64(), "immutable") + def window_count() -> WindowEvaluator: + return SimpleWindowCount() + + df = count_window_df.select( + window_count(column("a")) + .window_frame(WindowFrame("rows", None, None)) + .build() + .alias("count") + ) + result = df.collect()[0] + assert result.column(0) == pa.array([0, 1, 2]) + + +def test_udwf_decorator_with_args(count_window_df): + """Test UDWF decorator with constructor arguments.""" + + @udwf([pa.int64()], pa.int64(), "immutable") + def window_count_base10() -> WindowEvaluator: + return SimpleWindowCount(10) + + df = count_window_df.select( + window_count_base10(column("a")) + .window_frame(WindowFrame("rows", None, None)) + .build() + .alias("count") + ) + result = df.collect()[0] + assert result.column(0) == pa.array([10, 11, 12]) + + +def test_register_udwf(ctx, count_window_df): 
+ """Test registering and using UDWF in SQL context.""" + window_count = udwf( + SimpleWindowCount, + [pa.int64()], + pa.int64(), + volatility="immutable", + name="window_count", + ) + + ctx.register_udwf(window_count) + result = ctx.sql( + """ + SELECT window_count(a) + OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED + FOLLOWING) FROM test_table + """ + ).collect()[0] + assert result.column(0) == pa.array([0, 1, 2]) + + smooth_default = udwf( ExponentialSmoothDefault, pa.float64(), @@ -299,10 +419,50 @@ def test_udwf_errors(df): @pytest.mark.parametrize(("name", "expr", "expected"), data_test_udwf_functions) -def test_udwf_functions(df, name, expr, expected): - df = df.select("a", "b", f.round(expr, lit(3)).alias(name)) +def test_udwf_functions(complex_window_df, name, expr, expected): + df = complex_window_df.select("a", "b", f.round(expr, lit(3)).alias(name)) # execute and collect the first (and only) batch result = df.sort(column("a")).select(column(name)).collect()[0] assert result.column(0) == pa.array(expected) + + +@pytest.mark.parametrize( + "udwf_func", + [ + udwf(SimpleWindowCount, pa.int64(), pa.int64(), "immutable"), + udwf(SimpleWindowCount, [pa.int64()], pa.int64(), "immutable"), + udwf([pa.int64()], pa.int64(), "immutable")(lambda: SimpleWindowCount()), + udwf(pa.int64(), pa.int64(), "immutable")(lambda: SimpleWindowCount()), + ], +) +def test_udwf_overloads(udwf_func, count_window_df): + df = count_window_df.select( + udwf_func(column("a")) + .window_frame(WindowFrame("rows", None, None)) + .build() + .alias("count") + ) + result = df.collect()[0] + assert result.column(0) == pa.array([0, 1, 2]) + + +def test_udwf_named_function(ctx, count_window_df): + """Test UDWF with explicit name parameter.""" + window_count = udwf( + SimpleWindowCount, + pa.int64(), + pa.int64(), + volatility="immutable", + name="my_custom_counter", + ) + + ctx.register_udwf(window_count) + result = ctx.sql( + """ + SELECT my_custom_counter(a) + OVER (ROWS BETWEEN 
UNBOUNDED PRECEDING AND UNBOUNDED + FOLLOWING) FROM test_table""" + ).collect()[0] + assert result.column(0) == pa.array([0, 1, 2]) From 7c1c08f8617ac97a2568eb0664e9d4ee30fceba9 Mon Sep 17 00:00:00 2001 From: Nirnay Roy <32942494+nirnayroy@users.noreply.github.com> Date: Sat, 15 Mar 2025 17:05:05 +0530 Subject: [PATCH 114/248] feat: expose regex_count function (#1066) * Added wrapper for regex_count function * fix comment --------- Co-authored-by: Nirnay Roy --- python/datafusion/functions.py | 18 ++++++++++++++++++ python/tests/test_functions.py | 4 ++++ src/functions.rs | 20 ++++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 0cc7434cf..26bac149c 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -217,6 +217,7 @@ "random", "range", "rank", + "regexp_count", "regexp_like", "regexp_match", "regexp_replace", @@ -779,6 +780,23 @@ def regexp_replace( return Expr(f.regexp_replace(string.expr, pattern.expr, replacement.expr, flags)) +def regexp_count( + string: Expr, pattern: Expr, start: Expr, flags: Expr | None = None +) -> Expr: + """Returns the number of matches in a string. + + Optional start position (the first position is 1) to search for the regular + expression. 
+ """ + if flags is not None: + flags = flags.expr + if start is not None: + start = start.expr + else: + start = Expr.expr + return Expr(f.regexp_count(string.expr, pattern.expr, start, flags)) + + def repeat(string: Expr, n: Expr) -> Expr: """Repeats the ``string`` to ``n`` times.""" return Expr(f.repeat(string.expr, n.expr)) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index ed88a16e3..161e1e3bb 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -740,6 +740,10 @@ def test_array_function_obj_tests(stmt, py_expr): f.regexp_replace(column("a"), literal("(ell|orl)"), literal("-")), pa.array(["H-o", "W-d", "!"]), ), + ( + f.regexp_count(column("a"), literal("(ell|orl)"), literal(1)), + pa.array([1, 1, 0], type=pa.int64()), + ), ], ) def test_string_functions(df, function, expected_result): diff --git a/src/functions.rs b/src/functions.rs index 6a8abb18d..8fac239b4 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -173,6 +173,25 @@ fn regexp_replace( ) .into()) } + +#[pyfunction] +#[pyo3(signature = (string, pattern, start, flags=None))] +/// Returns the number of matches found in the string. 
+fn regexp_count( + string: PyExpr, + pattern: PyExpr, + start: Option, + flags: Option, +) -> PyResult { + Ok(functions::expr_fn::regexp_count( + string.expr, + pattern.expr, + start.map(|x| x.expr), + flags.map(|x| x.expr), + ) + .into()) +} + /// Creates a new Sort Expr #[pyfunction] fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) -> PyResult { @@ -943,6 +962,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(power))?; m.add_wrapped(wrap_pyfunction!(radians))?; m.add_wrapped(wrap_pyfunction!(random))?; + m.add_wrapped(wrap_pyfunction!(regexp_count))?; m.add_wrapped(wrap_pyfunction!(regexp_like))?; m.add_wrapped(wrap_pyfunction!(regexp_match))?; m.add_wrapped(wrap_pyfunction!(regexp_replace))?; From b8dd97bc8eefcfecfa8dcc864c4898c654b236a9 Mon Sep 17 00:00:00 2001 From: Spaarsh <67336892+Spaarsh@users.noreply.github.com> Date: Mon, 17 Mar 2025 20:08:16 +0530 Subject: [PATCH 115/248] Add additional ruff suggestions (#1062) * Enabled ruff rule PT001 and ANN204 * Enabled ruff rule B008 * Enabled ruff rule EM101 * Enabled ruff rule PLR1714 * Enabled ruff rule ANN201 * Enabled ruff rule C400 * Enabled ruff rule B904 * Enabled ruff rule UP006 * Enabled ruff rule RUF012 * Enabled ruff rule FBT003 * Enabled ruff rule C416 * Enabled ruff rule SIM102 * Enabled ruff rule PGH003 * Enabled ruff rule PERF401 * Enabled ruff rule EM102 * Enabled ruff rule SIM108 * Enabled ruff rule ICN001 * Enabled ruff rule ICN001 * implemented reviews * Update pyproject.toml to ignore `SIM102` * Enabled ruff rule PLW2901 * Enabled ruff rule RET503 * Fixed failing ruff tests --- benchmarks/db-benchmark/groupby-datafusion.py | 24 ++-- benchmarks/db-benchmark/join-datafusion.py | 5 +- benchmarks/tpch/tpch.py | 7 +- dev/release/generate-changelog.py | 6 +- docs/source/conf.py | 4 +- examples/create-context.py | 12 +- examples/python-udaf.py | 36 +++-- examples/python-udf-comparisons.py | 9 +- examples/python-udf.py | 12 +- 
examples/query-pyarrow-data.py | 10 +- examples/sql-using-python-udaf.py | 2 +- examples/tpch/_tests.py | 4 +- examples/tpch/convert_data_to_parquet.py | 134 +++++++++--------- examples/tpch/q08_market_share.py | 2 +- examples/tpch/q19_discounted_revenue.py | 4 +- .../tpch/q21_suppliers_kept_orders_waiting.py | 2 +- pyproject.toml | 20 --- python/datafusion/__init__.py | 8 +- python/datafusion/catalog.py | 4 +- python/datafusion/context.py | 51 +++---- python/datafusion/dataframe.py | 55 +++---- python/datafusion/expr.py | 31 ++-- python/datafusion/functions.py | 9 +- python/tests/test_functions.py | 2 +- python/tests/test_wrapper_coverage.py | 7 +- 25 files changed, 213 insertions(+), 247 deletions(-) diff --git a/benchmarks/db-benchmark/groupby-datafusion.py b/benchmarks/db-benchmark/groupby-datafusion.py index 04bf7a149..f9e8d638b 100644 --- a/benchmarks/db-benchmark/groupby-datafusion.py +++ b/benchmarks/db-benchmark/groupby-datafusion.py @@ -20,7 +20,7 @@ import timeit import datafusion as df -import pyarrow +import pyarrow as pa from datafusion import ( RuntimeEnvBuilder, SessionConfig, @@ -37,7 +37,7 @@ exec(open("./_helpers/helpers.py").read()) -def ans_shape(batches): +def ans_shape(batches) -> tuple[int, int]: rows, cols = 0, 0 for batch in batches: rows += batch.num_rows @@ -48,7 +48,7 @@ def ans_shape(batches): return rows, cols -def execute(df): +def execute(df) -> list: print(df.execution_plan().display_indent()) return df.collect() @@ -68,14 +68,14 @@ def execute(df): src_grp = os.path.join("data", data_name + ".csv") print("loading dataset %s" % src_grp, flush=True) -schema = pyarrow.schema( +schema = pa.schema( [ - ("id4", pyarrow.int32()), - ("id5", pyarrow.int32()), - ("id6", pyarrow.int32()), - ("v1", pyarrow.int32()), - ("v2", pyarrow.int32()), - ("v3", pyarrow.float64()), + ("id4", pa.int32()), + ("id5", pa.int32()), + ("id6", pa.int32()), + ("v1", pa.int32()), + ("v2", pa.int32()), + ("v3", pa.float64()), ] ) @@ -93,8 +93,8 @@ def 
execute(df): ) config = ( SessionConfig() - .with_repartition_joins(False) - .with_repartition_aggregations(False) + .with_repartition_joins(enabled=False) + .with_repartition_aggregations(enabled=False) .set("datafusion.execution.coalesce_batches", "false") ) ctx = SessionContext(config, runtime) diff --git a/benchmarks/db-benchmark/join-datafusion.py b/benchmarks/db-benchmark/join-datafusion.py index b45ebf632..039868031 100755 --- a/benchmarks/db-benchmark/join-datafusion.py +++ b/benchmarks/db-benchmark/join-datafusion.py @@ -29,7 +29,7 @@ exec(open("./_helpers/helpers.py").read()) -def ans_shape(batches): +def ans_shape(batches) -> tuple[int, int]: rows, cols = 0, 0 for batch in batches: rows += batch.num_rows @@ -57,7 +57,8 @@ def ans_shape(batches): os.path.join("data", y_data_name[2] + ".csv"), ] if len(src_jn_y) != 3: - raise Exception("Something went wrong in preparing files used for join") + error_msg = "Something went wrong in preparing files used for join" + raise Exception(error_msg) print( "loading datasets " diff --git a/benchmarks/tpch/tpch.py b/benchmarks/tpch/tpch.py index bfb9ac398..2d1bbae5b 100644 --- a/benchmarks/tpch/tpch.py +++ b/benchmarks/tpch/tpch.py @@ -21,7 +21,7 @@ from datafusion import SessionContext -def bench(data_path, query_path): +def bench(data_path, query_path) -> None: with open("results.csv", "w") as results: # register tables start = time.time() @@ -68,10 +68,7 @@ def bench(data_path, query_path): with open(f"{query_path}/q{query}.sql") as f: text = f.read() tmp = text.split(";") - queries = [] - for str in tmp: - if len(str.strip()) > 0: - queries.append(str.strip()) + queries = [s.strip() for s in tmp if len(s.strip()) > 0] try: start = time.time() diff --git a/dev/release/generate-changelog.py b/dev/release/generate-changelog.py index e30e2def2..d86736773 100755 --- a/dev/release/generate-changelog.py +++ b/dev/release/generate-changelog.py @@ -24,7 +24,7 @@ from github import Github -def print_pulls(repo_name, title, 
pulls): +def print_pulls(repo_name, title, pulls) -> None: if len(pulls) > 0: print(f"**{title}:**") print() @@ -34,7 +34,7 @@ def print_pulls(repo_name, title, pulls): print() -def generate_changelog(repo, repo_name, tag1, tag2, version): +def generate_changelog(repo, repo_name, tag1, tag2, version) -> None: # get a list of commits between two tags print(f"Fetching list of commits between {tag1} and {tag2}", file=sys.stderr) comparison = repo.compare(tag1, tag2) @@ -154,7 +154,7 @@ def generate_changelog(repo, repo_name, tag1, tag2, version): ) -def cli(args=None): +def cli(args=None) -> None: """Process command line arguments.""" if not args: args = sys.argv[1:] diff --git a/docs/source/conf.py b/docs/source/conf.py index c82a189e0..0be03d81d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -73,7 +73,7 @@ autoapi_python_class_content = "both" -def autoapi_skip_member_fn(app, what, name, obj, skip, options): # noqa: ARG001 +def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool: # noqa: ARG001 skip_contents = [ # Re-exports ("class", "datafusion.DataFrame"), @@ -93,7 +93,7 @@ def autoapi_skip_member_fn(app, what, name, obj, skip, options): # noqa: ARG001 return skip -def setup(sphinx): +def setup(sphinx) -> None: sphinx.connect("autoapi-skip-member", autoapi_skip_member_fn) diff --git a/examples/create-context.py b/examples/create-context.py index 760c8513e..0026d6162 100644 --- a/examples/create-context.py +++ b/examples/create-context.py @@ -25,14 +25,14 @@ runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000) config = ( SessionConfig() - .with_create_default_catalog_and_schema(True) + .with_create_default_catalog_and_schema(enabled=True) .with_default_catalog_and_schema("foo", "bar") .with_target_partitions(8) - .with_information_schema(True) - .with_repartition_joins(False) - .with_repartition_aggregations(False) - .with_repartition_windows(False) - .with_parquet_pruning(False) + 
.with_information_schema(enabled=True) + .with_repartition_joins(enabled=False) + .with_repartition_aggregations(enabled=False) + .with_repartition_windows(enabled=False) + .with_parquet_pruning(enabled=False) .set("datafusion.execution.parquet.pushdown_filters", "true") ) ctx = SessionContext(config, runtime) diff --git a/examples/python-udaf.py b/examples/python-udaf.py index 538f69571..6655edb0a 100644 --- a/examples/python-udaf.py +++ b/examples/python-udaf.py @@ -16,7 +16,7 @@ # under the License. import datafusion -import pyarrow +import pyarrow as pa import pyarrow.compute from datafusion import Accumulator, col, udaf @@ -26,25 +26,21 @@ class MyAccumulator(Accumulator): Interface of a user-defined accumulation. """ - def __init__(self): - self._sum = pyarrow.scalar(0.0) + def __init__(self) -> None: + self._sum = pa.scalar(0.0) - def update(self, values: pyarrow.Array) -> None: + def update(self, values: pa.Array) -> None: # not nice since pyarrow scalars can't be summed yet. This breaks on `None` - self._sum = pyarrow.scalar( - self._sum.as_py() + pyarrow.compute.sum(values).as_py() - ) + self._sum = pa.scalar(self._sum.as_py() + pa.compute.sum(values).as_py()) - def merge(self, states: pyarrow.Array) -> None: + def merge(self, states: pa.Array) -> None: # not nice since pyarrow scalars can't be summed yet. 
This breaks on `None` - self._sum = pyarrow.scalar( - self._sum.as_py() + pyarrow.compute.sum(states).as_py() - ) + self._sum = pa.scalar(self._sum.as_py() + pa.compute.sum(states).as_py()) - def state(self) -> pyarrow.Array: - return pyarrow.array([self._sum.as_py()]) + def state(self) -> pa.Array: + return pa.array([self._sum.as_py()]) - def evaluate(self) -> pyarrow.Scalar: + def evaluate(self) -> pa.Scalar: return self._sum @@ -52,17 +48,17 @@ def evaluate(self) -> pyarrow.Scalar: ctx = datafusion.SessionContext() # create a RecordBatch and a new DataFrame from it -batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], +batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], names=["a", "b"], ) df = ctx.create_dataframe([[batch]]) my_udaf = udaf( MyAccumulator, - pyarrow.float64(), - pyarrow.float64(), - [pyarrow.float64()], + pa.float64(), + pa.float64(), + [pa.float64()], "stable", ) @@ -70,4 +66,4 @@ def evaluate(self) -> pyarrow.Scalar: result = df.collect()[0] -assert result.column(0) == pyarrow.array([6.0]) +assert result.column(0) == pa.array([6.0]) diff --git a/examples/python-udf-comparisons.py b/examples/python-udf-comparisons.py index c5d5ec8dd..eb0825011 100644 --- a/examples/python-udf-comparisons.py +++ b/examples/python-udf-comparisons.py @@ -112,8 +112,8 @@ def is_of_interest_impl( returnflag_arr: pa.Array, ) -> pa.Array: result = [] - for idx, partkey in enumerate(partkey_arr): - partkey = partkey.as_py() + for idx, partkey_val in enumerate(partkey_arr): + partkey = partkey_val.as_py() suppkey = suppkey_arr[idx].as_py() returnflag = returnflag_arr[idx].as_py() value = (partkey, suppkey, returnflag) @@ -162,10 +162,7 @@ def udf_using_pyarrow_compute_impl( resultant_arr = pc.and_(filtered_partkey_arr, filtered_suppkey_arr) resultant_arr = pc.and_(resultant_arr, filtered_returnflag_arr) - if results is None: - results = resultant_arr - else: - results = pc.or_(results, 
resultant_arr) + results = resultant_arr if results is None else pc.or_(results, resultant_arr) return results diff --git a/examples/python-udf.py b/examples/python-udf.py index fb2bc253e..1c08acd1a 100644 --- a/examples/python-udf.py +++ b/examples/python-udf.py @@ -15,23 +15,23 @@ # specific language governing permissions and limitations # under the License. -import pyarrow +import pyarrow as pa from datafusion import SessionContext, udf from datafusion import functions as f -def is_null(array: pyarrow.Array) -> pyarrow.Array: +def is_null(array: pa.Array) -> pa.Array: return array.is_null() -is_null_arr = udf(is_null, [pyarrow.int64()], pyarrow.bool_(), "stable") +is_null_arr = udf(is_null, [pa.int64()], pa.bool_(), "stable") # create a context ctx = SessionContext() # create a RecordBatch and a new DataFrame from it -batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], +batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], names=["a", "b"], ) df = ctx.create_dataframe([[batch]]) @@ -40,4 +40,4 @@ def is_null(array: pyarrow.Array) -> pyarrow.Array: result = df.collect()[0] -assert result.column(0) == pyarrow.array([False] * 3) +assert result.column(0) == pa.array([False] * 3) diff --git a/examples/query-pyarrow-data.py b/examples/query-pyarrow-data.py index e3456fb5b..9cfe8a62b 100644 --- a/examples/query-pyarrow-data.py +++ b/examples/query-pyarrow-data.py @@ -16,15 +16,15 @@ # under the License. 
import datafusion -import pyarrow +import pyarrow as pa from datafusion import col # create a context ctx = datafusion.SessionContext() # create a RecordBatch and a new DataFrame from it -batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], +batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], names=["a", "b"], ) df = ctx.create_dataframe([[batch]]) @@ -38,5 +38,5 @@ # execute and collect the first (and only) batch result = df.collect()[0] -assert result.column(0) == pyarrow.array([5, 7, 9]) -assert result.column(1) == pyarrow.array([-3, -3, -3]) +assert result.column(0) == pa.array([5, 7, 9]) +assert result.column(1) == pa.array([-3, -3, -3]) diff --git a/examples/sql-using-python-udaf.py b/examples/sql-using-python-udaf.py index 60ab8d134..32ce38900 100644 --- a/examples/sql-using-python-udaf.py +++ b/examples/sql-using-python-udaf.py @@ -25,7 +25,7 @@ class MyAccumulator(Accumulator): Interface of a user-defined accumulation. 
""" - def __init__(self): + def __init__(self) -> None: self._sum = pa.scalar(0.0) def update(self, values: pa.Array) -> None: diff --git a/examples/tpch/_tests.py b/examples/tpch/_tests.py index 2be4dfabd..80ff80244 100644 --- a/examples/tpch/_tests.py +++ b/examples/tpch/_tests.py @@ -91,7 +91,7 @@ def check_q17(df): ("q22_global_sales_opportunity", "q22"), ], ) -def test_tpch_query_vs_answer_file(query_code: str, answer_file: str): +def test_tpch_query_vs_answer_file(query_code: str, answer_file: str) -> None: module = import_module(query_code) df: DataFrame = module.df @@ -122,3 +122,5 @@ def test_tpch_query_vs_answer_file(query_code: str, answer_file: str): assert df.join(df_expected, on=cols, how="anti").count() == 0 assert df.count() == df_expected.count() + + return None diff --git a/examples/tpch/convert_data_to_parquet.py b/examples/tpch/convert_data_to_parquet.py index 73097fac5..fd0fcca49 100644 --- a/examples/tpch/convert_data_to_parquet.py +++ b/examples/tpch/convert_data_to_parquet.py @@ -25,112 +25,112 @@ import os import datafusion -import pyarrow +import pyarrow as pa ctx = datafusion.SessionContext() all_schemas = {} all_schemas["customer"] = [ - ("C_CUSTKEY", pyarrow.int64()), - ("C_NAME", pyarrow.string()), - ("C_ADDRESS", pyarrow.string()), - ("C_NATIONKEY", pyarrow.int64()), - ("C_PHONE", pyarrow.string()), - ("C_ACCTBAL", pyarrow.decimal128(15, 2)), - ("C_MKTSEGMENT", pyarrow.string()), - ("C_COMMENT", pyarrow.string()), + ("C_CUSTKEY", pa.int64()), + ("C_NAME", pa.string()), + ("C_ADDRESS", pa.string()), + ("C_NATIONKEY", pa.int64()), + ("C_PHONE", pa.string()), + ("C_ACCTBAL", pa.decimal128(15, 2)), + ("C_MKTSEGMENT", pa.string()), + ("C_COMMENT", pa.string()), ] all_schemas["lineitem"] = [ - ("L_ORDERKEY", pyarrow.int64()), - ("L_PARTKEY", pyarrow.int64()), - ("L_SUPPKEY", pyarrow.int64()), - ("L_LINENUMBER", pyarrow.int32()), - ("L_QUANTITY", pyarrow.decimal128(15, 2)), - ("L_EXTENDEDPRICE", pyarrow.decimal128(15, 2)), - ("L_DISCOUNT", 
pyarrow.decimal128(15, 2)), - ("L_TAX", pyarrow.decimal128(15, 2)), - ("L_RETURNFLAG", pyarrow.string()), - ("L_LINESTATUS", pyarrow.string()), - ("L_SHIPDATE", pyarrow.date32()), - ("L_COMMITDATE", pyarrow.date32()), - ("L_RECEIPTDATE", pyarrow.date32()), - ("L_SHIPINSTRUCT", pyarrow.string()), - ("L_SHIPMODE", pyarrow.string()), - ("L_COMMENT", pyarrow.string()), + ("L_ORDERKEY", pa.int64()), + ("L_PARTKEY", pa.int64()), + ("L_SUPPKEY", pa.int64()), + ("L_LINENUMBER", pa.int32()), + ("L_QUANTITY", pa.decimal128(15, 2)), + ("L_EXTENDEDPRICE", pa.decimal128(15, 2)), + ("L_DISCOUNT", pa.decimal128(15, 2)), + ("L_TAX", pa.decimal128(15, 2)), + ("L_RETURNFLAG", pa.string()), + ("L_LINESTATUS", pa.string()), + ("L_SHIPDATE", pa.date32()), + ("L_COMMITDATE", pa.date32()), + ("L_RECEIPTDATE", pa.date32()), + ("L_SHIPINSTRUCT", pa.string()), + ("L_SHIPMODE", pa.string()), + ("L_COMMENT", pa.string()), ] all_schemas["nation"] = [ - ("N_NATIONKEY", pyarrow.int64()), - ("N_NAME", pyarrow.string()), - ("N_REGIONKEY", pyarrow.int64()), - ("N_COMMENT", pyarrow.string()), + ("N_NATIONKEY", pa.int64()), + ("N_NAME", pa.string()), + ("N_REGIONKEY", pa.int64()), + ("N_COMMENT", pa.string()), ] all_schemas["orders"] = [ - ("O_ORDERKEY", pyarrow.int64()), - ("O_CUSTKEY", pyarrow.int64()), - ("O_ORDERSTATUS", pyarrow.string()), - ("O_TOTALPRICE", pyarrow.decimal128(15, 2)), - ("O_ORDERDATE", pyarrow.date32()), - ("O_ORDERPRIORITY", pyarrow.string()), - ("O_CLERK", pyarrow.string()), - ("O_SHIPPRIORITY", pyarrow.int32()), - ("O_COMMENT", pyarrow.string()), + ("O_ORDERKEY", pa.int64()), + ("O_CUSTKEY", pa.int64()), + ("O_ORDERSTATUS", pa.string()), + ("O_TOTALPRICE", pa.decimal128(15, 2)), + ("O_ORDERDATE", pa.date32()), + ("O_ORDERPRIORITY", pa.string()), + ("O_CLERK", pa.string()), + ("O_SHIPPRIORITY", pa.int32()), + ("O_COMMENT", pa.string()), ] all_schemas["part"] = [ - ("P_PARTKEY", pyarrow.int64()), - ("P_NAME", pyarrow.string()), - ("P_MFGR", pyarrow.string()), - ("P_BRAND", 
pyarrow.string()), - ("P_TYPE", pyarrow.string()), - ("P_SIZE", pyarrow.int32()), - ("P_CONTAINER", pyarrow.string()), - ("P_RETAILPRICE", pyarrow.decimal128(15, 2)), - ("P_COMMENT", pyarrow.string()), + ("P_PARTKEY", pa.int64()), + ("P_NAME", pa.string()), + ("P_MFGR", pa.string()), + ("P_BRAND", pa.string()), + ("P_TYPE", pa.string()), + ("P_SIZE", pa.int32()), + ("P_CONTAINER", pa.string()), + ("P_RETAILPRICE", pa.decimal128(15, 2)), + ("P_COMMENT", pa.string()), ] all_schemas["partsupp"] = [ - ("PS_PARTKEY", pyarrow.int64()), - ("PS_SUPPKEY", pyarrow.int64()), - ("PS_AVAILQTY", pyarrow.int32()), - ("PS_SUPPLYCOST", pyarrow.decimal128(15, 2)), - ("PS_COMMENT", pyarrow.string()), + ("PS_PARTKEY", pa.int64()), + ("PS_SUPPKEY", pa.int64()), + ("PS_AVAILQTY", pa.int32()), + ("PS_SUPPLYCOST", pa.decimal128(15, 2)), + ("PS_COMMENT", pa.string()), ] all_schemas["region"] = [ - ("r_REGIONKEY", pyarrow.int64()), - ("r_NAME", pyarrow.string()), - ("r_COMMENT", pyarrow.string()), + ("r_REGIONKEY", pa.int64()), + ("r_NAME", pa.string()), + ("r_COMMENT", pa.string()), ] all_schemas["supplier"] = [ - ("S_SUPPKEY", pyarrow.int64()), - ("S_NAME", pyarrow.string()), - ("S_ADDRESS", pyarrow.string()), - ("S_NATIONKEY", pyarrow.int32()), - ("S_PHONE", pyarrow.string()), - ("S_ACCTBAL", pyarrow.decimal128(15, 2)), - ("S_COMMENT", pyarrow.string()), + ("S_SUPPKEY", pa.int64()), + ("S_NAME", pa.string()), + ("S_ADDRESS", pa.string()), + ("S_NATIONKEY", pa.int32()), + ("S_PHONE", pa.string()), + ("S_ACCTBAL", pa.decimal128(15, 2)), + ("S_COMMENT", pa.string()), ] curr_dir = os.path.dirname(os.path.abspath(__file__)) -for filename, curr_schema in all_schemas.items(): +for filename, curr_schema_val in all_schemas.items(): # For convenience, go ahead and convert the schema column names to lowercase - curr_schema = [(s[0].lower(), s[1]) for s in curr_schema] + curr_schema = [(s[0].lower(), s[1]) for s in curr_schema_val] # Pre-collect the output columns so we can ignore the null field we 
add # in to handle the trailing | in the file output_cols = [r[0] for r in curr_schema] - curr_schema = [pyarrow.field(r[0], r[1], nullable=False) for r in curr_schema] + curr_schema = [pa.field(r[0], r[1], nullable=False) for r in curr_schema] # Trailing | requires extra field for in processing - curr_schema.append(("some_null", pyarrow.null())) + curr_schema.append(("some_null", pa.null())) - schema = pyarrow.schema(curr_schema) + schema = pa.schema(curr_schema) source_file = os.path.abspath( os.path.join(curr_dir, f"../../benchmarks/tpch/data/{filename}.csv") diff --git a/examples/tpch/q08_market_share.py b/examples/tpch/q08_market_share.py index d46df30f2..4bf50efba 100644 --- a/examples/tpch/q08_market_share.py +++ b/examples/tpch/q08_market_share.py @@ -150,7 +150,7 @@ df = df.with_column( "national_volume", F.case(col("s_suppkey").is_null()) - .when(lit(False), col("volume")) + .when(lit(value=False), col("volume")) .otherwise(lit(0.0)), ) diff --git a/examples/tpch/q19_discounted_revenue.py b/examples/tpch/q19_discounted_revenue.py index 2b87e1120..bd492aac0 100644 --- a/examples/tpch/q19_discounted_revenue.py +++ b/examples/tpch/q19_discounted_revenue.py @@ -89,8 +89,8 @@ def is_of_interest( same number of rows in the output. 
""" result = [] - for idx, brand in enumerate(brand_arr): - brand = brand.as_py() + for idx, brand_val in enumerate(brand_arr): + brand = brand_val.as_py() if brand in items_of_interest: values_of_interest = items_of_interest[brand] diff --git a/examples/tpch/q21_suppliers_kept_orders_waiting.py b/examples/tpch/q21_suppliers_kept_orders_waiting.py index 9bbaad779..619c4406b 100644 --- a/examples/tpch/q21_suppliers_kept_orders_waiting.py +++ b/examples/tpch/q21_suppliers_kept_orders_waiting.py @@ -65,7 +65,7 @@ df = df.with_column( "failed_supp", F.case(col("l_receiptdate") > col("l_commitdate")) - .when(lit(True), col("l_suppkey")) + .when(lit(value=True), col("l_suppkey")) .end(), ) diff --git a/pyproject.toml b/pyproject.toml index a4ed18c4c..d86b657ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,37 +80,17 @@ ignore = [ "TD003", # Allow TODO lines "UP007", # Disallowing Union is pedantic # TODO: Enable all of the following, but this PR is getting too large already - "PT001", - "ANN204", - "B008", - "EM101", "PLR0913", - "PLR1714", - "ANN201", - "C400", "TRY003", - "B904", - "UP006", - "RUF012", - "FBT003", - "C416", - "SIM102", - "PGH003", "PLR2004", - "PERF401", "PD901", - "EM102", "ERA001", - "SIM108", - "ICN001", "ANN001", "ANN202", "PTH", "N812", "INP001", "DTZ007", - "PLW2901", - "RET503", "RUF015", "A005", "TC001", diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 286e5dc31..d871fdb71 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -92,17 +92,17 @@ ] -def column(value: str): +def column(value: str) -> Expr: """Create a column expression.""" return Expr.column(value) -def col(value: str): +def col(value: str) -> Expr: """Create a column expression.""" return Expr.column(value) -def literal(value): +def literal(value) -> Expr: """Create a literal expression.""" return Expr.literal(value) @@ -120,6 +120,6 @@ def str_lit(value): return string_literal(value) -def lit(value): +def 
lit(value) -> Expr: """Create a literal expression.""" return Expr.literal(value) diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py index 0560f4704..6c3f188cc 100644 --- a/python/datafusion/catalog.py +++ b/python/datafusion/catalog.py @@ -24,7 +24,7 @@ import datafusion._internal as df_internal if TYPE_CHECKING: - import pyarrow + import pyarrow as pa class Catalog: @@ -67,7 +67,7 @@ def __init__(self, table: df_internal.Table) -> None: self.table = table @property - def schema(self) -> pyarrow.Schema: + def schema(self) -> pa.Schema: """Returns the schema associated with this table.""" return self.table.schema diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 58ad9a943..1429a4975 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -40,9 +40,9 @@ if TYPE_CHECKING: import pathlib - import pandas - import polars - import pyarrow + import pandas as pd + import polars as pl + import pyarrow as pa from datafusion.plan import ExecutionPlan, LogicalPlan @@ -537,7 +537,7 @@ def register_listing_table( path: str | pathlib.Path, table_partition_cols: list[tuple[str, str]] | None = None, file_extension: str = ".parquet", - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_sort_order: list[list[Expr | SortExpr]] | None = None, ) -> None: """Register multiple files as a single table. @@ -606,14 +606,14 @@ def sql_with_options(self, query: str, options: SQLOptions) -> DataFrame: def create_dataframe( self, - partitions: list[list[pyarrow.RecordBatch]], + partitions: list[list[pa.RecordBatch]], name: str | None = None, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, ) -> DataFrame: """Create and return a dataframe using the provided partitions. Args: - partitions: :py:class:`pyarrow.RecordBatch` partitions to register. + partitions: :py:class:`pa.RecordBatch` partitions to register. name: Resultant dataframe name. 
schema: Schema for the partitions. @@ -684,16 +684,14 @@ def from_arrow( return DataFrame(self.ctx.from_arrow(data, name)) @deprecated("Use ``from_arrow`` instead.") - def from_arrow_table( - self, data: pyarrow.Table, name: str | None = None - ) -> DataFrame: + def from_arrow_table(self, data: pa.Table, name: str | None = None) -> DataFrame: """Create a :py:class:`~datafusion.dataframe.DataFrame` from an Arrow table. This is an alias for :py:func:`from_arrow`. """ return self.from_arrow(data, name) - def from_pandas(self, data: pandas.DataFrame, name: str | None = None) -> DataFrame: + def from_pandas(self, data: pd.DataFrame, name: str | None = None) -> DataFrame: """Create a :py:class:`~datafusion.dataframe.DataFrame` from a Pandas DataFrame. Args: @@ -705,7 +703,7 @@ def from_pandas(self, data: pandas.DataFrame, name: str | None = None) -> DataFr """ return DataFrame(self.ctx.from_pandas(data, name)) - def from_polars(self, data: polars.DataFrame, name: str | None = None) -> DataFrame: + def from_polars(self, data: pl.DataFrame, name: str | None = None) -> DataFrame: """Create a :py:class:`~datafusion.dataframe.DataFrame` from a Polars DataFrame. Args: @@ -719,7 +717,7 @@ def from_polars(self, data: polars.DataFrame, name: str | None = None) -> DataFr # https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 # is the discussion on how we arrived at adding register_view - def register_view(self, name: str, df: DataFrame): + def register_view(self, name: str, df: DataFrame) -> None: """Register a :py:class: `~datafusion.detaframe.DataFrame` as a view. Args: @@ -755,7 +753,7 @@ def register_table_provider( self.ctx.register_table_provider(name, provider) def register_record_batches( - self, name: str, partitions: list[list[pyarrow.RecordBatch]] + self, name: str, partitions: list[list[pa.RecordBatch]] ) -> None: """Register record batches as a table. 
@@ -776,7 +774,7 @@ def register_parquet( parquet_pruning: bool = True, file_extension: str = ".parquet", skip_metadata: bool = True, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_sort_order: list[list[SortExpr]] | None = None, ) -> None: """Register a Parquet file as a table. @@ -817,7 +815,7 @@ def register_csv( self, name: str, path: str | pathlib.Path | list[str | pathlib.Path], - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, has_header: bool = True, delimiter: str = ",", schema_infer_max_records: int = 1000, @@ -843,10 +841,7 @@ def register_csv( selected for data input. file_compression_type: File compression type. """ - if isinstance(path, list): - path = [str(p) for p in path] - else: - path = str(path) + path = [str(p) for p in path] if isinstance(path, list) else str(path) self.ctx.register_csv( name, @@ -863,7 +858,7 @@ def register_json( self, name: str, path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, schema_infer_max_records: int = 1000, file_extension: str = ".json", table_partition_cols: list[tuple[str, str]] | None = None, @@ -901,7 +896,7 @@ def register_avro( self, name: str, path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_extension: str = ".avro", table_partition_cols: list[tuple[str, str]] | None = None, ) -> None: @@ -923,8 +918,8 @@ def register_avro( name, str(path), schema, file_extension, table_partition_cols ) - def register_dataset(self, name: str, dataset: pyarrow.dataset.Dataset) -> None: - """Register a :py:class:`pyarrow.dataset.Dataset` as a table. + def register_dataset(self, name: str, dataset: pa.dataset.Dataset) -> None: + """Register a :py:class:`pa.dataset.Dataset` as a table. Args: name: Name of the table to register. 
@@ -975,7 +970,7 @@ def session_id(self) -> str: def read_json( self, path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, schema_infer_max_records: int = 1000, file_extension: str = ".json", table_partition_cols: list[tuple[str, str]] | None = None, @@ -1012,7 +1007,7 @@ def read_json( def read_csv( self, path: str | pathlib.Path | list[str] | list[pathlib.Path], - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, has_header: bool = True, delimiter: str = ",", schema_infer_max_records: int = 1000, @@ -1065,7 +1060,7 @@ def read_parquet( parquet_pruning: bool = True, file_extension: str = ".parquet", skip_metadata: bool = True, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_sort_order: list[list[Expr | SortExpr]] | None = None, ) -> DataFrame: """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`. @@ -1110,7 +1105,7 @@ def read_parquet( def read_avro( self, path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_partition_cols: list[tuple[str, str]] | None = None, file_extension: str = ".avro", ) -> DataFrame: diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index d1c71c2bb..26fe8f453 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -26,10 +26,8 @@ TYPE_CHECKING, Any, Iterable, - List, Literal, Optional, - Type, Union, overload, ) @@ -75,7 +73,7 @@ class Compression(Enum): LZ4_RAW = "lz4_raw" @classmethod - def from_str(cls: Type[Compression], value: str) -> Compression: + def from_str(cls: type[Compression], value: str) -> Compression: """Convert a string to a Compression enum value. 
Args: @@ -89,11 +87,13 @@ def from_str(cls: Type[Compression], value: str) -> Compression: """ try: return cls(value.lower()) - except ValueError: + except ValueError as err: valid_values = str([item.value for item in Compression]) - raise ValueError( - f"{value} is not a valid Compression. Valid values are: {valid_values}" - ) + error_msg = f""" + {value} is not a valid Compression. + Valid values are: {valid_values} + """ + raise ValueError(error_msg) from err def get_default_level(self) -> Optional[int]: """Get the default compression level for the compression type. @@ -132,7 +132,7 @@ def into_view(self) -> pa.Table: """Convert DataFrame as a ViewTable which can be used in register_table.""" return self.df.into_view() - def __getitem__(self, key: str | List[str]) -> DataFrame: + def __getitem__(self, key: str | list[str]) -> DataFrame: """Return a new :py:class`DataFrame` with the specified column or columns. Args: @@ -287,8 +287,7 @@ def _simplify_expression( if isinstance(expr, Expr): expr_list.append(expr.expr) elif isinstance(expr, Iterable): - for inner_expr in expr: - expr_list.append(inner_expr.expr) + expr_list.extend(inner_expr.expr for inner_expr in expr) else: raise NotImplementedError if named_exprs: @@ -513,10 +512,15 @@ def join( # This check is to prevent breaking API changes where users prior to # DF 43.0.0 would pass the join_keys as a positional argument instead # of a keyword argument. 
- if isinstance(on, tuple) and len(on) == 2: - if isinstance(on[0], list) and isinstance(on[1], list): - join_keys = on # type: ignore - on = None + if ( + isinstance(on, tuple) + and len(on) == 2 + and isinstance(on[0], list) + and isinstance(on[1], list) + ): + # We know this is safe because we've checked the types + join_keys = on # type: ignore[assignment] + on = None if join_keys is not None: warnings.warn( @@ -529,18 +533,17 @@ def join( if on is not None: if left_on is not None or right_on is not None: - raise ValueError( - "`left_on` or `right_on` should not provided with `on`" - ) + error_msg = "`left_on` or `right_on` should not provided with `on`" + raise ValueError(error_msg) left_on = on right_on = on elif left_on is not None or right_on is not None: if left_on is None or right_on is None: - raise ValueError("`left_on` and `right_on` should both be provided.") + error_msg = "`left_on` and `right_on` should both be provided." + raise ValueError(error_msg) else: - raise ValueError( - "either `on` or `left_on` and `right_on` should be provided." - ) + error_msg = "either `on` or `left_on` and `right_on` should be provided." + raise ValueError(error_msg) if isinstance(left_on, str): left_on = [left_on] if isinstance(right_on, str): @@ -726,9 +729,11 @@ def write_parquet( if isinstance(compression, str): compression = Compression.from_str(compression) - if compression in {Compression.GZIP, Compression.BROTLI, Compression.ZSTD}: - if compression_level is None: - compression_level = compression.get_default_level() + if ( + compression in {Compression.GZIP, Compression.BROTLI, Compression.ZSTD} + and compression_level is None + ): + compression_level = compression.get_default_level() self.df.write_parquet(str(path), compression.value, compression_level) @@ -824,7 +829,7 @@ def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFram Returns: A DataFrame with the columns expanded. 
""" - columns = [c for c in columns] + columns = list(columns) return DataFrame(self.df.unnest_columns(columns, preserve_nulls=preserve_nulls)) def __arrow_c_stream__(self, requested_schema: pa.Schema) -> Any: diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 77b6c272d..2697d8143 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -22,7 +22,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Optional, Type +from typing import TYPE_CHECKING, Any, ClassVar, Optional import pyarrow as pa @@ -176,7 +176,7 @@ def sort_or_default(e: Expr | SortExpr) -> expr_internal.SortExpr: """Helper function to return a default Sort if an Expr is provided.""" if isinstance(e, SortExpr): return e.raw_sort - return SortExpr(e, True, True).raw_sort + return SortExpr(e, ascending=True, nulls_first=True).raw_sort def sort_list_to_raw_sort_list( @@ -439,24 +439,21 @@ def fill_null(self, value: Any | Expr | None = None) -> Expr: value = Expr.literal(value) return Expr(functions_internal.nvl(self.expr, value.expr)) - _to_pyarrow_types = { + _to_pyarrow_types: ClassVar[dict[type, pa.DataType]] = { float: pa.float64(), int: pa.int64(), str: pa.string(), bool: pa.bool_(), } - def cast( - self, to: pa.DataType[Any] | Type[float] | Type[int] | Type[str] | Type[bool] - ) -> Expr: + def cast(self, to: pa.DataType[Any] | type[float | int | str | bool]) -> Expr: """Cast to a new data type.""" if not isinstance(to, pa.DataType): try: to = self._to_pyarrow_types[to] - except KeyError: - raise TypeError( - "Expected instance of pyarrow.DataType or builtins.type" - ) + except KeyError as err: + error_msg = "Expected instance of pyarrow.DataType or builtins.type" + raise TypeError(error_msg) from err return Expr(self.expr.cast(to)) @@ -565,9 +562,7 @@ def partition_by(self, *partition_by: Expr) -> ExprFuncBuilder: set parameters for either window or aggregate functions. 
If used on any other type of expression, an error will be generated when ``build()`` is called. """ - return ExprFuncBuilder( - self.expr.partition_by(list(e.expr for e in partition_by)) - ) + return ExprFuncBuilder(self.expr.partition_by([e.expr for e in partition_by])) def window_frame(self, window_frame: WindowFrame) -> ExprFuncBuilder: """Set the frame fora window function. @@ -610,7 +605,7 @@ def over(self, window: Window) -> Expr: class ExprFuncBuilder: - def __init__(self, builder: expr_internal.ExprFuncBuilder): + def __init__(self, builder: expr_internal.ExprFuncBuilder) -> None: self.builder = builder def order_by(self, *exprs: Expr) -> ExprFuncBuilder: @@ -638,7 +633,7 @@ def null_treatment(self, null_treatment: NullTreatment) -> ExprFuncBuilder: def partition_by(self, *partition_by: Expr) -> ExprFuncBuilder: """Set partitioning for window functions.""" return ExprFuncBuilder( - self.builder.partition_by(list(e.expr for e in partition_by)) + self.builder.partition_by([e.expr for e in partition_by]) ) def window_frame(self, window_frame: WindowFrame) -> ExprFuncBuilder: @@ -693,11 +688,11 @@ def __init__( """ if not isinstance(start_bound, pa.Scalar) and start_bound is not None: start_bound = pa.scalar(start_bound) - if units == "rows" or units == "groups": + if units in ("rows", "groups"): start_bound = start_bound.cast(pa.uint64()) if not isinstance(end_bound, pa.Scalar) and end_bound is not None: end_bound = pa.scalar(end_bound) - if units == "rows" or units == "groups": + if units in ("rows", "groups"): end_bound = end_bound.cast(pa.uint64()) self.window_frame = expr_internal.WindowFrame(units, start_bound, end_bound) @@ -709,7 +704,7 @@ def get_lower_bound(self) -> WindowFrameBound: """Returns starting bound.""" return WindowFrameBound(self.window_frame.get_lower_bound()) - def get_upper_bound(self): + def get_upper_bound(self) -> WindowFrameBound: """Returns end bound.""" return WindowFrameBound(self.window_frame.get_upper_bound()) diff --git 
a/python/datafusion/functions.py b/python/datafusion/functions.py index 26bac149c..5cf914e16 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -790,10 +790,7 @@ def regexp_count( """ if flags is not None: flags = flags.expr - if start is not None: - start = start.expr - else: - start = Expr.expr + start = start.expr if start is not None else Expr.expr return Expr(f.regexp_count(string.expr, pattern.expr, start, flags)) @@ -817,13 +814,15 @@ def right(string: Expr, n: Expr) -> Expr: return Expr(f.right(string.expr, n.expr)) -def round(value: Expr, decimal_places: Expr = Expr.literal(0)) -> Expr: +def round(value: Expr, decimal_places: Expr | None = None) -> Expr: """Round the argument to the nearest integer. If the optional ``decimal_places`` is specified, round to the nearest number of decimal places. You can specify a negative number of decimal places. For example ``round(lit(125.2345), lit(-2))`` would yield a value of ``100.0``. """ + if decimal_places is None: + decimal_places = Expr.literal(0) return Expr(f.round(value.expr, decimal_places.expr)) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 161e1e3bb..37f2075f5 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -81,7 +81,7 @@ def test_literal(df): literal("1"), literal("OK"), literal(3.14), - literal(True), + literal(value=True), literal(b"hello world"), ) result = df.collect() diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py index a2de2d32b..926a65961 100644 --- a/python/tests/test_wrapper_coverage.py +++ b/python/tests/test_wrapper_coverage.py @@ -28,7 +28,7 @@ from enum import EnumMeta as EnumType -def missing_exports(internal_obj, wrapped_obj) -> None: # noqa: C901 +def missing_exports(internal_obj, wrapped_obj) -> None: """ Identify if any of the rust exposted structs or functions do not have wrappers. 
@@ -56,9 +56,8 @@ def missing_exports(internal_obj, wrapped_obj) -> None: # noqa: C901 # __kwdefaults__ and __doc__. As long as these are None on the internal # object, it's okay to skip them. However if they do exist on the internal # object they must also exist on the wrapped object. - if internal_attr is not None: - if wrapped_attr is None: - pytest.fail(f"Missing attribute: {internal_attr_name}") + if internal_attr is not None and wrapped_attr is None: + pytest.fail(f"Missing attribute: {internal_attr_name}") if internal_attr_name in ["__self__", "__class__"]: continue From 42982dad27ad03e7e9395d4c3ae3064c2b489434 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 22 Mar 2025 10:14:55 -0400 Subject: [PATCH 116/248] Improve collection during repr and repr_html (#1036) * Improve table readout of a dataframe in jupyter notebooks by making the table scrollable and displaying the first record batch up to 2MB * Add option to only display a portion of a cell data and the user can click on a button to toggle showing more or less * We cannot expect that the first non-empy batch is sufficient for our 2MB limit, so switch over to collecting until we run out or use up the size * Update python unit test to allow the additional formatting data to exist and only check the table contents * Combining collection for repr and repr_html into one function * Small clippy suggestion * Collect was occuring twice on repr * Switch to execute_stream_partitioned --- python/tests/test_dataframe.py | 23 ++-- src/dataframe.rs | 240 ++++++++++++++++++++++++++++----- src/utils.rs | 2 +- 3 files changed, 225 insertions(+), 40 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 384b17878..718ebf69d 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
import os +import re from typing import Any import pyarrow as pa @@ -1245,13 +1246,17 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame: def test_dataframe_repr_html(df) -> None: output = df._repr_html_() - ref_html = """
- - - - -
abc
148
255
368
- """ + # Since we've added a fair bit of processing to the html output, lets just verify + # the values we are expecting in the table exist. Use regex and ignore everything + # between the and . We also don't want the closing > on the + # td and th segments because that is where the formatting data is written. - # Ignore whitespace just to make this test look cleaner - assert output.replace(" ", "") == ref_html.replace(" ", "") + headers = ["a", "b", "c"] + headers = [f"{v}" for v in headers] + header_pattern = "(.*?)".join(headers) + assert len(re.findall(header_pattern, output, re.DOTALL)) == 1 + + body_data = [[1, 4, 8], [2, 5, 5], [3, 6, 8]] + body_lines = [f"{v}" for inner in body_data for v in inner] + body_pattern = "(.*?)".join(body_lines) + assert len(re.findall(body_pattern, output, re.DOTALL)) == 1 diff --git a/src/dataframe.rs b/src/dataframe.rs index 243e2e14f..be10b8c28 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -31,9 +31,11 @@ use datafusion::common::UnnestOptions; use datafusion::config::{CsvOptions, TableParquetOptions}; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::datasource::TableProvider; +use datafusion::error::DataFusionError; use datafusion::execution::SendableRecordBatchStream; use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; use datafusion::prelude::*; +use futures::{StreamExt, TryStreamExt}; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; @@ -70,6 +72,9 @@ impl PyTableProvider { PyTable::new(table_provider) } } +const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB +const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20; +const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25; /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. 
@@ -111,56 +116,151 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult { - let df = self.df.as_ref().clone().limit(0, Some(10))?; - let batches = wait_for_future(py, df.collect())?; - let batches_as_string = pretty::pretty_format_batches(&batches); - match batches_as_string { - Ok(batch) => Ok(format!("DataFrame()\n{batch}")), - Err(err) => Ok(format!("Error: {:?}", err.to_string())), + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10), + )?; + if batches.is_empty() { + // This should not be reached, but do it for safety since we index into the vector below + return Ok("No data to display".to_string()); } - } - fn _repr_html_(&self, py: Python) -> PyDataFusionResult { - let mut html_str = "\n".to_string(); + let batches_as_displ = + pretty::pretty_format_batches(&batches).map_err(py_datafusion_err)?; + + let additional_str = match has_more { + true => "\nData truncated.", + false => "", + }; - let df = self.df.as_ref().clone().limit(0, Some(10))?; - let batches = wait_for_future(py, df.collect())?; + Ok(format!("DataFrame()\n{batches_as_displ}{additional_str}")) + } + fn _repr_html_(&self, py: Python) -> PyDataFusionResult { + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display( + self.df.as_ref().clone(), + MIN_TABLE_ROWS_TO_DISPLAY, + usize::MAX, + ), + )?; if batches.is_empty() { - html_str.push_str("
\n"); - return Ok(html_str); + // This should not be reached, but do it for safety since we index into the vector below + return Ok("No data to display".to_string()); } + let table_uuid = uuid::Uuid::new_v4().to_string(); + + let mut html_str = " + + +

+ + \n".to_string(); + let schema = batches[0].schema(); let mut header = Vec::new(); for field in schema.fields() { - header.push(format!("", field.name())); } let header_str = header.join(""); - html_str.push_str(&format!("{}\n", header_str)); - - for batch in batches { - let formatters = batch - .columns() - .iter() - .map(|c| ArrayFormatter::try_new(c.as_ref(), &FormatOptions::default())) - .map(|c| { - c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string()))) - }) - .collect::, _>>()?; - - for row in 0..batch.num_rows() { + html_str.push_str(&format!("{}\n", header_str)); + + let batch_formatters = batches + .iter() + .map(|batch| { + batch + .columns() + .iter() + .map(|c| ArrayFormatter::try_new(c.as_ref(), &FormatOptions::default())) + .map(|c| { + c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string()))) + }) + .collect::, _>>() + }) + .collect::, _>>()?; + + let rows_per_batch = batches.iter().map(|batch| batch.num_rows()); + + // We need to build up row by row for html + let mut table_row = 0; + for (batch_formatter, num_rows_in_batch) in batch_formatters.iter().zip(rows_per_batch) { + for batch_row in 0..num_rows_in_batch { + table_row += 1; let mut cells = Vec::new(); - for formatter in &formatters { - cells.push(format!("", formatter.value(row))); + for (col, formatter) in batch_formatter.iter().enumerate() { + let cell_data = formatter.value(batch_row).to_string(); + // From testing, primitive data types do not typically get larger than 21 characters + if cell_data.len() > MAX_LENGTH_CELL_WITHOUT_MINIMIZE { + let short_cell_data = &cell_data[0..MAX_LENGTH_CELL_WITHOUT_MINIMIZE]; + cells.push(format!(" + ")); + } else { + cells.push(format!("", formatter.value(batch_row))); + } } let row_str = cells.join(""); html_str.push_str(&format!("{}\n", row_str)); } } + html_str.push_str("
{}", field.name())); + header.push(format!("{}
{} +
+ {short_cell_data} + {cell_data} + +
+
{}
\n"); + + html_str.push_str(" + + "); - html_str.push_str("\n"); + if has_more { + html_str.push_str("Data truncated due to size."); + } Ok(html_str) } @@ -771,3 +871,83 @@ fn record_batch_into_schema( RecordBatch::try_new(schema, data_arrays) } + +/// This is a helper function to return the first non-empty record batch from executing a DataFrame. +/// It additionally returns a bool, which indicates if there are more record batches available. +/// We do this so we can determine if we should indicate to the user that the data has been +/// truncated. This collects until we have achived both of these two conditions +/// +/// - We have collected our minimum number of rows +/// - We have reached our limit, either data size or maximum number of rows +/// +/// Otherwise it will return when the stream has exhausted. If you want a specific number of +/// rows, set min_rows == max_rows. +async fn collect_record_batches_to_display( + df: DataFrame, + min_rows: usize, + max_rows: usize, +) -> Result<(Vec, bool), DataFusionError> { + let partitioned_stream = df.execute_stream_partitioned().await?; + let mut stream = futures::stream::iter(partitioned_stream).flatten(); + let mut size_estimate_so_far = 0; + let mut rows_so_far = 0; + let mut record_batches = Vec::default(); + let mut has_more = false; + + while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows) + || rows_so_far < min_rows + { + let mut rb = match stream.next().await { + None => { + break; + } + Some(Ok(r)) => r, + Some(Err(e)) => return Err(e), + }; + + let mut rows_in_rb = rb.num_rows(); + if rows_in_rb > 0 { + size_estimate_so_far += rb.get_array_memory_size(); + + if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY { + let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32; + let total_rows = rows_in_rb + rows_so_far; + + let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; + if reduced_row_num < min_rows { + reduced_row_num = 
min_rows.min(total_rows); + } + + let limited_rows_this_rb = reduced_row_num - rows_so_far; + if limited_rows_this_rb < rows_in_rb { + rows_in_rb = limited_rows_this_rb; + rb = rb.slice(0, limited_rows_this_rb); + has_more = true; + } + } + + if rows_in_rb + rows_so_far > max_rows { + rb = rb.slice(0, max_rows - rows_so_far); + has_more = true; + } + + rows_so_far += rb.num_rows(); + record_batches.push(rb); + } + } + + if record_batches.is_empty() { + return Ok((Vec::default(), false)); + } + + if !has_more { + // Data was not already truncated, so check to see if more record batches remain + has_more = match stream.try_next().await { + Ok(None) => false, // reached end + Ok(Some(_)) => true, + Err(_) => false, // Stream disconnected + }; + } + + Ok((record_batches, has_more)) +} diff --git a/src/utils.rs b/src/utils.rs index 999aad755..3487de21b 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -42,7 +42,7 @@ pub(crate) fn get_tokio_runtime() -> &'static TokioRuntime { #[inline] pub(crate) fn get_global_ctx() -> &'static SessionContext { static CTX: OnceLock = OnceLock::new(); - CTX.get_or_init(|| SessionContext::new()) + CTX.get_or_init(SessionContext::new) } /// Utility to collect rust futures with GIL released From d0315ffa704aba467f769f444208b7ce26d83037 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 22 Mar 2025 14:37:24 -0400 Subject: [PATCH 117/248] feat: Update DataFusion dependency to 46 (#1079) * Update DataFusion dependency to 46 * There was an update upstream in the exec but it is not a breaking change and only needs unit test updates --- Cargo.lock | 296 +++++++++++++++++++-------------- Cargo.toml | 18 +- python/tests/test_dataframe.py | 3 +- src/expr.rs | 39 +++-- src/expr/aggregate.rs | 10 +- src/expr/aggregate_expr.rs | 11 +- src/expr/window.rs | 24 ++- src/functions.rs | 34 ++-- 8 files changed, 252 insertions(+), 183 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5c7f2bf3c..3a4915f23 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ 
-179,9 +179,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "755b6da235ac356a869393c23668c663720b8749dd6f15e52b6c214b4b964cc7" +checksum = "84ef243634a39fb6e9d1710737e7a5ef96c9bacabd2326859ff889bc9ef755e5" dependencies = [ "arrow-arith", "arrow-array", @@ -201,9 +201,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64656a1e0b13ca766f8440752e9a93e11014eec7b67909986f83ed0ab1fe37b8" +checksum = "8f420c6aef51dad2e4a96ce29c0ec90ad84880bdb60b321c74c652a6be07b93f" dependencies = [ "arrow-array", "arrow-buffer", @@ -215,9 +215,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57a4a6d2896083cfbdf84a71a863b22460d0708f8206a8373c52e326cc72ea1a" +checksum = "24bda5ff6461a4ff9739959b3d57b377f45e3f878f7be1a4f28137c0a8f339fa" dependencies = [ "ahash", "arrow-buffer", @@ -232,9 +232,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cef870583ce5e4f3b123c181706f2002fb134960f9a911900f64ba4830c7a43a" +checksum = "bc6ed265c73f134a583d02c3cab5e16afab9446d8048ede8707e31f85fad58a0" dependencies = [ "bytes", "half", @@ -243,9 +243,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ac7eba5a987f8b4a7d9629206ba48e19a1991762795bbe5d08497b7736017ee" +checksum = "01c648572391edcef10e5fd458db70ba27ed6f71bcaee04397d0cfb100b34f8b" dependencies = [ "arrow-array", "arrow-buffer", @@ -264,9 +264,9 @@ dependencies = [ [[package]] name = 
"arrow-csv" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90f12542b8164398fc9ec595ff783c4cf6044daa89622c5a7201be920e4c0d4c" +checksum = "a02fb265a6d8011a7d3ad1a36f25816ad0a3bb04cb8e9fe7929c165b98c0cbcd" dependencies = [ "arrow-array", "arrow-cast", @@ -280,9 +280,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b095e8a4f3c309544935d53e04c3bfe4eea4e71c3de6fe0416d1f08bb4441a83" +checksum = "5f2cebf504bb6a92a134a87fff98f01b14fbb3a93ecf7aef90cd0f888c5fffa4" dependencies = [ "arrow-buffer", "arrow-schema", @@ -292,9 +292,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65c63da4afedde2b25ef69825cd4663ca76f78f79ffe2d057695742099130ff6" +checksum = "8e6405b287671c88846e7751f7291f717b164911474cabac6d3d8614d5aa7374" dependencies = [ "arrow-array", "arrow-buffer", @@ -306,9 +306,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9551d9400532f23a370cabbea1dc5a53c49230397d41f96c4c8eedf306199305" +checksum = "5329bf9e7390cbb6b117ddd4d82e94c5362ea4cab5095697139429f36a38350c" dependencies = [ "arrow-array", "arrow-buffer", @@ -319,16 +319,18 @@ dependencies = [ "half", "indexmap", "lexical-core", + "memchr", "num", "serde", "serde_json", + "simdutf8", ] [[package]] name = "arrow-ord" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c07223476f8219d1ace8cd8d85fa18c4ebd8d945013f25ef5c72e85085ca4ee" +checksum = "e103c13d4b80da28339c1d7aa23dd85bd59f42158acc45d39eeb6770627909ce" dependencies = [ "arrow-array", "arrow-buffer", @@ -339,9 +341,9 @@ dependencies = [ [[package]] name = 
"arrow-row" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91b194b38bfd89feabc23e798238989c6648b2506ad639be42ec8eb1658d82c4" +checksum = "170549a11b8534f3097a0619cfe89c42812345dc998bcf81128fc700b84345b8" dependencies = [ "arrow-array", "arrow-buffer", @@ -352,18 +354,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f40f6be8f78af1ab610db7d9b236e21d587b7168e368a36275d2e5670096735" +checksum = "a5c53775bba63f319189f366d2b86e9a8889373eb198f07d8544938fc9f8ed9a" dependencies = [ "bitflags 2.8.0", ] [[package]] name = "arrow-select" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac265273864a820c4a179fc67182ccc41ea9151b97024e1be956f0f2369c2539" +checksum = "0a99003b2eb562b8d9c99dfb672306f15e94b20d3734179d596895703e821dcf" dependencies = [ "ahash", "arrow-array", @@ -375,9 +377,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d44c8eed43be4ead49128370f7131f054839d3d6003e52aebf64322470b8fbd0" +checksum = "90fdb130ee8325f4cd8262e19bb6baa3cbcef2b2573c4bee8c6fda7ea08199d7" dependencies = [ "arrow-array", "arrow-buffer", @@ -535,9 +537,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.5" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e" +checksum = "b17679a8d69b6d7fd9cd9801a536cec9fa5e5970b69f9d4747f70b39b031f5e7" dependencies = [ "arrayref", "arrayvec", @@ -649,15 +651,15 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.39" +version = "0.4.40" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" +checksum = "1a7964611d71df112cb1730f2ee67324fcf4d0fc6606acbbe9bfe06df124637c" dependencies = [ "android-tzdata", "iana-time-zone", "num-traits", "serde", - "windows-targets", + "windows-link", ] [[package]] @@ -864,30 +866,32 @@ dependencies = [ [[package]] name = "datafusion" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae420e7a5b0b7f1c39364cc76cbcd0f5fdc416b2514ae3847c2676bbd60702a" +checksum = "914e6f9525599579abbd90b0f7a55afcaaaa40350b9e9ed52563f126dfe45fd3" dependencies = [ "apache-avro", "arrow", - "arrow-array", "arrow-ipc", "arrow-schema", - "async-compression", "async-trait", "bytes", "bzip2 0.5.1", "chrono", "datafusion-catalog", + "datafusion-catalog-listing", "datafusion-common", "datafusion-common-runtime", + "datafusion-datasource", "datafusion-execution", "datafusion-expr", + "datafusion-expr-common", "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-nested", "datafusion-functions-table", "datafusion-functions-window", + "datafusion-macros", "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -896,7 +900,6 @@ dependencies = [ "datafusion-sql", "flate2", "futures", - "glob", "itertools 0.14.0", "log", "num-traits", @@ -908,7 +911,6 @@ dependencies = [ "sqlparser", "tempfile", "tokio", - "tokio-util", "url", "uuid", "xz2", @@ -917,9 +919,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f27987bc22b810939e8dfecc55571e9d50355d6ea8ec1c47af8383a76a6d0e1" +checksum = "998a6549e6ee4ee3980e05590b2960446a56b343ea30199ef38acd0e0b9036e2" dependencies = [ "arrow", "async-trait", @@ -933,22 +935,40 @@ dependencies = [ "itertools 0.14.0", "log", "parking_lot", - 
"sqlparser", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "46.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5ac10096a5b3c0d8a227176c0e543606860842e943594ccddb45cf42a526e43" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "futures", + "log", + "object_store", + "tokio", ] [[package]] name = "datafusion-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3f6d5b8c9408cc692f7c194b8aa0c0f9b253e065a8d960ad9cdc2a13e697602" +checksum = "1f53d7ec508e1b3f68bd301cee3f649834fad51eff9240d898a4b2614cfd0a7a" dependencies = [ "ahash", "apache-avro", "arrow", - "arrow-array", - "arrow-buffer", "arrow-ipc", - "arrow-schema", "base64 0.22.1", "half", "hashbrown 0.14.5", @@ -966,25 +986,59 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d4603c8e8a4baf77660ab7074cc66fc15cc8a18f2ce9dfadb755fc6ee294e48" +checksum = "e0fcf41523b22e14cc349b01526e8b9f59206653037f2949a4adbfde5f8cb668" dependencies = [ "log", "tokio", ] +[[package]] +name = "datafusion-datasource" +version = "46.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf7f37ad8b6e88b46c7eeab3236147d32ea64b823544f498455a8d9042839c92" +dependencies = [ + "arrow", + "async-compression", + "async-trait", + "bytes", + "bzip2 0.5.1", + "chrono", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "flate2", + "futures", + "glob", + "itertools 0.14.0", + 
"log", + "object_store", + "rand", + "tokio", + "tokio-util", + "url", + "xz2", + "zstd", +] + [[package]] name = "datafusion-doc" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5bf4bc68623a5cf231eed601ed6eb41f46a37c4d15d11a0bff24cbc8396cd66" +checksum = "7db7a0239fd060f359dc56c6e7db726abaa92babaed2fb2e91c3a8b2fff8b256" [[package]] name = "datafusion-execution" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88b491c012cdf8e051053426013429a76f74ee3c2db68496c79c323ca1084d27" +checksum = "0938f9e5b6bc5782be4111cdfb70c02b7b5451bf34fd57e4de062a7f7c4e31f1" dependencies = [ "arrow", "dashmap", @@ -1001,9 +1055,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5a181408d4fc5dc22f9252781a8f39f2d0e5d1b33ec9bde242844980a2689c1" +checksum = "b36c28b00b00019a8695ad7f1a53ee1673487b90322ecbd604e2cf32894eb14f" dependencies = [ "arrow", "chrono", @@ -1022,26 +1076,25 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1129b48e8534d8c03c6543bcdccef0b55c8ac0c1272a15a56c67068b6eb1885" +checksum = "18f0a851a436c5a2139189eb4617a54e6a9ccb9edc96c4b3c83b3bb7c58b950e" dependencies = [ "arrow", "datafusion-common", + "indexmap", "itertools 0.14.0", "paste", ] [[package]] name = "datafusion-ffi" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff47a79d442207c168c6e3e1d970c248589c148e4800e5b285ac1b2cb1a230f8" +checksum = "d740dd9f32a4f4ed1b907e6934201bb059efe6c877532512c661771d973c7b21" dependencies = [ "abi_stable", "arrow", - "arrow-array", - "arrow-schema", "async-ffi", "async-trait", "datafusion", @@ -1055,9 +1108,9 @@ 
dependencies = [ [[package]] name = "datafusion-functions" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6125874e4856dfb09b59886784fcb74cde5cfc5930b3a80a1a728ef7a010df6b" +checksum = "e3196e37d7b65469fb79fee4f05e5bb58a456831035f9a38aa5919aeb3298d40" dependencies = [ "arrow", "arrow-buffer", @@ -1071,7 +1124,6 @@ dependencies = [ "datafusion-expr", "datafusion-expr-common", "datafusion-macros", - "hashbrown 0.14.5", "hex", "itertools 0.14.0", "log", @@ -1085,14 +1137,12 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3add7b1d3888e05e7c95f2b281af900ca69ebdcb21069ba679b33bde8b3b9d6" +checksum = "adfc2d074d5ee4d9354fdcc9283d5b2b9037849237ddecb8942a29144b77ca05" dependencies = [ "ahash", "arrow", - "arrow-buffer", - "arrow-schema", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -1108,9 +1158,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e18baa4cfc3d2f144f74148ed68a1f92337f5072b6dde204a0dbbdf3324989c" +checksum = "1cbceba0f98d921309a9121b702bcd49289d383684cccabf9a92cda1602f3bbb" dependencies = [ "ahash", "arrow", @@ -1121,15 +1171,12 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ec5ee8cecb0dc370291279673097ddabec03a011f73f30d7f1096457127e03e" +checksum = "170e27ce4baa27113ddf5f77f1a7ec484b0dbeda0c7abbd4bad3fc609c8ab71a" dependencies = [ "arrow", - "arrow-array", - "arrow-buffer", "arrow-ord", - "arrow-schema", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -1145,9 +1192,9 @@ dependencies = [ [[package]] name = 
"datafusion-functions-table" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c403ddd473bbb0952ba880008428b3c7febf0ed3ce1eec35a205db20efb2a36" +checksum = "7d3a06a7f0817ded87b026a437e7e51de7f59d48173b0a4e803aa896a7bd6bb5" dependencies = [ "arrow", "async-trait", @@ -1161,9 +1208,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ab18c2fb835614d06a75f24a9e09136d3a8c12a92d97c95a6af316a1787a9c5" +checksum = "d6c608b66496a1e05e3d196131eb9bebea579eed1f59e88d962baf3dda853bc6" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1178,9 +1225,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a77b73bc15e7d1967121fdc7a55d819bfb9d6c03766a6c322247dce9094a53a4" +checksum = "da2f9d83348957b4ad0cd87b5cb9445f2651863a36592fe5484d43b49a5f8d82" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1188,9 +1235,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09369b8d962291e808977cf94d495fd8b5b38647232d7ef562c27ac0f495b0af" +checksum = "4800e1ff7ecf8f310887e9b54c9c444b8e215ccbc7b21c2f244cfae373b1ece7" dependencies = [ "datafusion-expr", "quote", @@ -1199,9 +1246,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2403a7e4a84637f3de7d8d4d7a9ccc0cc4be92d89b0161ba3ee5be82f0531c54" +checksum = "971c51c54cd309001376fae752fb15a6b41750b6d1552345c46afbfb6458801b" dependencies = [ "arrow", "chrono", @@ -1218,15 +1265,12 @@ dependencies = [ [[package]] name = 
"datafusion-physical-expr" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86ff72ac702b62dbf2650c4e1d715ebd3e4aab14e3885e72e8549e250307347c" +checksum = "e1447c2c6bc8674a16be4786b4abf528c302803fafa186aa6275692570e64d85" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", - "arrow-schema", "datafusion-common", "datafusion-expr", "datafusion-expr-common", @@ -1243,13 +1287,12 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60982b7d684e25579ee29754b4333057ed62e2cc925383c5f0bd8cab7962f435" +checksum = "69f8c25dcd069073a75b3d2840a79d0f81e64bdd2c05f2d3d18939afb36a7dcb" dependencies = [ "ahash", "arrow", - "arrow-buffer", "datafusion-common", "datafusion-expr-common", "hashbrown 0.14.5", @@ -1258,12 +1301,11 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac5e85c189d5238a5cf181a624e450c4cd4c66ac77ca551d6f3ff9080bac90bb" +checksum = "68da5266b5b9847c11d1b3404ee96b1d423814e1973e1ad3789131e5ec912763" dependencies = [ "arrow", - "arrow-schema", "datafusion-common", "datafusion-execution", "datafusion-expr", @@ -1271,23 +1313,19 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", - "futures", "itertools 0.14.0", "log", "recursive", - "url", ] [[package]] name = "datafusion-physical-plan" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c36bf163956d7e2542657c78b3383fdc78f791317ef358a359feffcdb968106f" +checksum = "88cc160df00e413e370b3b259c8ea7bfbebc134d32de16325950e9e923846b7f" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", "arrow-ord", "arrow-schema", 
"async-trait", @@ -1312,9 +1350,9 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2db5d79f0c974041787b899d24dc91bdab2ff112d1942dd71356a4ce3b407e6c" +checksum = "6f6ef4c6eb52370cb48639e25e2331a415aac0b2b0a0a472b36e26603bdf184f" dependencies = [ "arrow", "chrono", @@ -1328,9 +1366,9 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de21bde1603aac0ff32cf478e47081be6e3583c6861fe8f57034da911efe7578" +checksum = "5faf4a9bbb0d0a305fea8a6db21ba863286b53e53a212e687d2774028dd6f03f" dependencies = [ "arrow", "datafusion-common", @@ -1362,13 +1400,11 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13caa4daede211ecec53c78b13c503b592794d125f9a3cc3afe992edf9e7f43" +checksum = "325a212b67b677c0eb91447bf9a11b630f9fc4f62d8e5d145bf859f5a6b29e64" dependencies = [ "arrow", - "arrow-array", - "arrow-schema", "bigdecimal", "datafusion-common", "datafusion-expr", @@ -1381,11 +1417,10 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1634405abd8bd3c64c352f2da2f2aec6d80a815930257e0db0ce4ff5daf00944" +checksum = "2c2be3226a683e02cff65181e66e62eba9f812ed0e9b7ec8fe11ac8dabf1a73f" dependencies = [ - "arrow-buffer", "async-recursion", "async-trait", "chrono", @@ -1395,6 +1430,7 @@ dependencies = [ "pbjson-types", "prost", "substrait", + "tokio", "url", ] @@ -1472,9 +1508,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.35" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" +checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc" dependencies = [ "crc32fast", "miniz_oxide", @@ -2117,9 +2153,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.169" +version = "0.2.171" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" [[package]] name = "libflate" @@ -2447,9 +2483,9 @@ dependencies = [ [[package]] name = "parquet" -version = "54.1.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a01a0efa30bbd601ae85b375c728efdb211ade54390281628a7b16708beb235" +checksum = "94243778210509a5a5e9e012872127180c155d73a9cd6e2df9243d213e81e100" dependencies = [ "ahash", "arrow-array", @@ -2479,7 +2515,6 @@ dependencies = [ "tokio", "twox-hash", "zstd", - "zstd-sys", ] [[package]] @@ -3401,11 +3436,12 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.53.0" +version = "0.54.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8" +checksum = "c66e3b7374ad4a6af849b08b3e7a6eda0edbd82f0fd59b57e22671bf16979899" dependencies = [ "log", + "recursive", "sqlparser_derive", ] @@ -3466,9 +3502,9 @@ dependencies = [ [[package]] name = "substrait" -version = "0.52.3" +version = "0.53.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5db15789cecbfdf6b1fcf2db807e767c92273bdc407ac057c2194b070c597756" +checksum = "6fac3d70185423235f37b889764e184b81a5af4bb7c95833396ee9bd92577e1b" dependencies = [ "heck", "pbjson", @@ -3922,12 +3958,14 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.13.1" +version = "1.16.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ced87ca4be083373936a67f8de945faa23b6b42384bd5b64434850802c6dccd0" +checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9" dependencies = [ "getrandom 0.3.1", + "js-sys", "serde", + "wasm-bindgen", ] [[package]] @@ -4114,6 +4152,12 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "windows-link" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38" + [[package]] name = "windows-registry" version = "0.2.0" diff --git a/Cargo.toml b/Cargo.toml index 50967a219..8afabdd82 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,24 +34,24 @@ protoc = [ "datafusion-substrait/protoc" ] substrait = ["dep:datafusion-substrait"] [dependencies] -tokio = { version = "1.42", features = ["macros", "rt", "rt-multi-thread", "sync"] } +tokio = { version = "1.43", features = ["macros", "rt", "rt-multi-thread", "sync"] } pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] } pyo3-async-runtimes = { version = "0.23", features = ["tokio-runtime"]} -arrow = { version = "54", features = ["pyarrow"] } -datafusion = { version = "45.0.0", features = ["avro", "unicode_expressions"] } -datafusion-substrait = { version = "45.0.0", optional = true } -datafusion-proto = { version = "45.0.0" } -datafusion-ffi = { version = "45.0.0" } -prost = "0.13" # keep in line with `datafusion-substrait` +arrow = { version = "54.2.1", features = ["pyarrow"] } +datafusion = { version = "46.0.1", features = ["avro", "unicode_expressions"] } +datafusion-substrait = { version = "46.0.1", optional = true } +datafusion-proto = { version = "46.0.1" } +datafusion-ffi = { version = "46.0.1" } +prost = "0.13.1" # keep in line with `datafusion-substrait` uuid = { version = "1.12", features = ["v4"] } mimalloc = { version = "0.1", optional = true, default-features = false, features = 
["local_dynamic_tls"] } -async-trait = "0.1" +async-trait = "0.1.73" futures = "0.3" object_store = { version = "0.11.0", features = ["aws", "gcp", "azure", "http"] } url = "2" [build-dependencies] -prost-types = "0.13" # keep in line with `datafusion-substrait` +prost-types = "0.13.1" # keep in line with `datafusion-substrait` pyo3-build-config = "0.23" [lib] diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 718ebf69d..eda13930d 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -753,7 +753,8 @@ def test_execution_plan(aggregate_df): assert "AggregateExec:" in indent assert "CoalesceBatchesExec:" in indent assert "RepartitionExec:" in indent - assert "CsvExec:" in indent + assert "DataSourceExec:" in indent + assert "file_type=csv" in indent ctx = SessionContext() rows_returned = 0 diff --git a/src/expr.rs b/src/expr.rs index d3c528eb4..561170289 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use datafusion::logical_expr::expr::{AggregateFunctionParams, WindowFunctionParams}; use datafusion::logical_expr::utils::exprlist_to_fields; use datafusion::logical_expr::{ ExprFuncBuilder, ExprFunctionExt, LogicalPlan, WindowFunctionDefinition, @@ -172,6 +173,7 @@ impl PyExpr { Expr::ScalarSubquery(value) => { Ok(scalar_subquery::PyScalarSubquery::from(value.clone()).into_bound_py_any(py)?) } + #[allow(deprecated)] Expr::Wildcard { qualifier, options } => Err(py_unsupported_variant_err(format!( "Converting Expr::Wildcard to a Python object is not implemented : {:?} {:?}", qualifier, options @@ -332,7 +334,6 @@ impl PyExpr { | Expr::AggregateFunction { .. } | Expr::WindowFunction { .. } | Expr::InList { .. } - | Expr::Wildcard { .. } | Expr::Exists { .. } | Expr::InSubquery { .. } | Expr::GroupingSet(..) 
@@ -346,6 +347,10 @@ impl PyExpr { | Expr::Unnest(_) | Expr::IsNotUnknown(_) => RexType::Call, Expr::ScalarSubquery(..) => RexType::ScalarSubquery, + #[allow(deprecated)] + Expr::Wildcard { .. } => { + return Err(py_unsupported_variant_err("Expr::Wildcard is unsupported")) + } }) } @@ -394,11 +399,15 @@ impl PyExpr { | Expr::InSubquery(InSubquery { expr, .. }) => Ok(vec![PyExpr::from(*expr.clone())]), // Expr variants containing a collection of Expr(s) for operands - Expr::AggregateFunction(AggregateFunction { args, .. }) + Expr::AggregateFunction(AggregateFunction { + params: AggregateFunctionParams { args, .. }, + .. + }) | Expr::ScalarFunction(ScalarFunction { args, .. }) - | Expr::WindowFunction(WindowFunction { args, .. }) => { - Ok(args.iter().map(|arg| PyExpr::from(arg.clone())).collect()) - } + | Expr::WindowFunction(WindowFunction { + params: WindowFunctionParams { args, .. }, + .. + }) => Ok(args.iter().map(|arg| PyExpr::from(arg.clone())).collect()), // Expr(s) that require more specific processing Expr::Case(Case { @@ -465,13 +474,17 @@ impl PyExpr { Expr::GroupingSet(..) | Expr::Unnest(_) | Expr::OuterReferenceColumn(_, _) - | Expr::Wildcard { .. } | Expr::ScalarSubquery(..) | Expr::Placeholder { .. } | Expr::Exists { .. } => Err(py_runtime_err(format!( "Unimplemented Expr type: {}", self.expr ))), + + #[allow(deprecated)] + Expr::Wildcard { .. } => { + Err(py_unsupported_variant_err("Expr::Wildcard is unsupported")) + } } } @@ -575,7 +588,7 @@ impl PyExpr { Expr::AggregateFunction(agg_fn) => { let window_fn = Expr::WindowFunction(WindowFunction::new( WindowFunctionDefinition::AggregateUDF(agg_fn.func.clone()), - agg_fn.args.clone(), + agg_fn.params.args.clone(), )); add_builder_fns_to_window( @@ -663,16 +676,8 @@ impl PyExpr { /// Create a [Field] representing an [Expr], given an input [LogicalPlan] to resolve against pub fn expr_to_field(expr: &Expr, input_plan: &LogicalPlan) -> PyDataFusionResult> { - match expr { - Expr::Wildcard { .. 
} => { - // Since * could be any of the valid column names just return the first one - Ok(Arc::new(input_plan.schema().field(0).clone())) - } - _ => { - let fields = exprlist_to_fields(&[expr.clone()], input_plan)?; - Ok(fields[0].1.clone()) - } - } + let fields = exprlist_to_fields(&[expr.clone()], input_plan)?; + Ok(fields[0].1.clone()) } fn _types(expr: &Expr) -> PyResult { match expr { diff --git a/src/expr/aggregate.rs b/src/expr/aggregate.rs index 8fc9da5b0..a99d83d23 100644 --- a/src/expr/aggregate.rs +++ b/src/expr/aggregate.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::common::DataFusionError; -use datafusion::logical_expr::expr::{AggregateFunction, Alias}; +use datafusion::logical_expr::expr::{AggregateFunction, AggregateFunctionParams, Alias}; use datafusion::logical_expr::logical_plan::Aggregate; use datafusion::logical_expr::Expr; use pyo3::{prelude::*, IntoPyObjectExt}; @@ -126,9 +126,11 @@ impl PyAggregate { match expr { // TODO: This Alias logic seems to be returning some strange results that we should investigate Expr::Alias(Alias { expr, .. }) => self._aggregation_arguments(expr.as_ref()), - Expr::AggregateFunction(AggregateFunction { func: _, args, .. }) => { - Ok(args.iter().map(|e| PyExpr::from(e.clone())).collect()) - } + Expr::AggregateFunction(AggregateFunction { + func: _, + params: AggregateFunctionParams { args, .. }, + .. 
+ }) => Ok(args.iter().map(|e| PyExpr::from(e.clone())).collect()), _ => Err(py_type_err( "Encountered a non Aggregate type in aggregation_arguments", )), diff --git a/src/expr/aggregate_expr.rs b/src/expr/aggregate_expr.rs index 09471097f..c09f116e3 100644 --- a/src/expr/aggregate_expr.rs +++ b/src/expr/aggregate_expr.rs @@ -40,7 +40,13 @@ impl From for PyAggregateFunction { impl Display for PyAggregateFunction { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let args: Vec = self.aggr.args.iter().map(|expr| expr.to_string()).collect(); + let args: Vec = self + .aggr + .params + .args + .iter() + .map(|expr| expr.to_string()) + .collect(); write!(f, "{}({})", self.aggr.func.name(), args.join(", ")) } } @@ -54,12 +60,13 @@ impl PyAggregateFunction { /// is this a distinct aggregate such as `COUNT(DISTINCT expr)` fn is_distinct(&self) -> bool { - self.aggr.distinct + self.aggr.params.distinct } /// Get the arguments to the aggregate function fn args(&self) -> Vec { self.aggr + .params .args .iter() .map(|expr| PyExpr::from(expr.clone())) diff --git a/src/expr/window.rs b/src/expr/window.rs index 13deaec25..c5467bf94 100644 --- a/src/expr/window.rs +++ b/src/expr/window.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::common::{DataFusionError, ScalarValue}; -use datafusion::logical_expr::expr::WindowFunction; +use datafusion::logical_expr::expr::{WindowFunction, WindowFunctionParams}; use datafusion::logical_expr::{Expr, Window, WindowFrame, WindowFrameBound, WindowFrameUnits}; use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; @@ -118,7 +118,10 @@ impl PyWindowExpr { /// Returns order by columns in a window function expression pub fn get_sort_exprs(&self, expr: PyExpr) -> PyResult> { match expr.expr.unalias() { - Expr::WindowFunction(WindowFunction { order_by, .. }) => py_sort_expr_list(&order_by), + Expr::WindowFunction(WindowFunction { + params: WindowFunctionParams { order_by, .. }, + .. 
+ }) => py_sort_expr_list(&order_by), other => Err(not_window_function_err(other)), } } @@ -126,9 +129,10 @@ impl PyWindowExpr { /// Return partition by columns in a window function expression pub fn get_partition_exprs(&self, expr: PyExpr) -> PyResult> { match expr.expr.unalias() { - Expr::WindowFunction(WindowFunction { partition_by, .. }) => { - py_expr_list(&partition_by) - } + Expr::WindowFunction(WindowFunction { + params: WindowFunctionParams { partition_by, .. }, + .. + }) => py_expr_list(&partition_by), other => Err(not_window_function_err(other)), } } @@ -136,7 +140,10 @@ impl PyWindowExpr { /// Return input args for window function pub fn get_args(&self, expr: PyExpr) -> PyResult> { match expr.expr.unalias() { - Expr::WindowFunction(WindowFunction { args, .. }) => py_expr_list(&args), + Expr::WindowFunction(WindowFunction { + params: WindowFunctionParams { args, .. }, + .. + }) => py_expr_list(&args), other => Err(not_window_function_err(other)), } } @@ -152,7 +159,10 @@ impl PyWindowExpr { /// Returns a Pywindow frame for a given window function expression pub fn get_frame(&self, expr: PyExpr) -> Option { match expr.expr.unalias() { - Expr::WindowFunction(WindowFunction { window_frame, .. }) => Some(window_frame.into()), + Expr::WindowFunction(WindowFunction { + params: WindowFunctionParams { window_frame, .. }, + .. 
+ }) => Some(window_frame.into()), _ => None, } } diff --git a/src/functions.rs b/src/functions.rs index 8fac239b4..9c406b95a 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -17,6 +17,7 @@ use datafusion::functions_aggregate::all_default_aggregate_functions; use datafusion::functions_window::all_default_window_functions; +use datafusion::logical_expr::expr::WindowFunctionParams; use datafusion::logical_expr::ExprFunctionExt; use datafusion::logical_expr::WindowFrame; use pyo3::{prelude::*, wrap_pyfunction}; @@ -215,10 +216,7 @@ fn alias(expr: PyExpr, name: &str) -> PyResult { #[pyfunction] fn col(name: &str) -> PyResult { Ok(PyExpr { - expr: datafusion::logical_expr::Expr::Column(Column { - relation: None, - name: name.to_string(), - }), + expr: datafusion::logical_expr::Expr::Column(Column::new_unqualified(name)), }) } @@ -333,19 +331,21 @@ fn window( Ok(PyExpr { expr: datafusion::logical_expr::Expr::WindowFunction(WindowFunction { fun, - args: args.into_iter().map(|x| x.expr).collect::>(), - partition_by: partition_by - .unwrap_or_default() - .into_iter() - .map(|x| x.expr) - .collect::>(), - order_by: order_by - .unwrap_or_default() - .into_iter() - .map(|x| x.into()) - .collect::>(), - window_frame, - null_treatment: None, + params: WindowFunctionParams { + args: args.into_iter().map(|x| x.expr).collect::>(), + partition_by: partition_by + .unwrap_or_default() + .into_iter() + .map(|x| x.expr) + .collect::>(), + order_by: order_by + .unwrap_or_default() + .into_iter() + .map(|x| x.into()) + .collect::>(), + window_frame, + null_treatment: None, + }, }), }) } From 583e1e9420906c99b1fbdf57c0138f1e67548008 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 30 Mar 2025 08:44:55 -0400 Subject: [PATCH 118/248] Update changelog and version number (#1089) --- Cargo.lock | 2 +- Cargo.toml | 2 +- dev/changelog/46.0.0.md | 73 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 dev/changelog/46.0.0.md 
diff --git a/Cargo.lock b/Cargo.lock index 3a4915f23..f90038c50 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1377,7 +1377,7 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "45.2.0" +version = "46.0.0" dependencies = [ "arrow", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index 8afabdd82..bc8639d4c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "45.2.0" +version = "46.0.0" homepage = "https://datafusion.apache.org/python" repository = "https://github.com/apache/datafusion-python" authors = ["Apache DataFusion "] diff --git a/dev/changelog/46.0.0.md b/dev/changelog/46.0.0.md new file mode 100644 index 000000000..3e5768099 --- /dev/null +++ b/dev/changelog/46.0.0.md @@ -0,0 +1,73 @@ + + +# Apache DataFusion Python 46.0.0 Changelog + +This release consists of 21 commits from 11 contributors. See credits at the end of this changelog for more information. + +**Implemented enhancements:** + +- feat: reads using global ctx [#982](https://github.com/apache/datafusion-python/pull/982) (ion-elgreco) +- feat: Implementation of udf and udaf decorator [#1040](https://github.com/apache/datafusion-python/pull/1040) (CrystalZhou0529) +- feat: expose regex_count function [#1066](https://github.com/apache/datafusion-python/pull/1066) (nirnayroy) +- feat: Update DataFusion dependency to 46 [#1079](https://github.com/apache/datafusion-python/pull/1079) (timsaucer) + +**Fixed bugs:** + +- fix: add to_timestamp_nanos [#1020](https://github.com/apache/datafusion-python/pull/1020) (chenkovsky) +- fix: type checking [#993](https://github.com/apache/datafusion-python/pull/993) (chenkovsky) + +**Other:** + +- [infra] Fail Clippy on rust build warnings [#1029](https://github.com/apache/datafusion-python/pull/1029) (kevinjqliu) +- Add user documentation for the FFI approach [#1031](https://github.com/apache/datafusion-python/pull/1031) (timsaucer) +- build(deps): bump arrow from 54.1.0 to 54.2.0 
[#1035](https://github.com/apache/datafusion-python/pull/1035) (dependabot[bot]) +- Chore: Release datafusion-python 45 [#1024](https://github.com/apache/datafusion-python/pull/1024) (timsaucer) +- Enable Dataframe to be converted into views which can be used in register_table [#1016](https://github.com/apache/datafusion-python/pull/1016) (kosiew) +- Add ruff check for missing futures import [#1052](https://github.com/apache/datafusion-python/pull/1052) (timsaucer) +- Enable take comments to assign issues to users [#1058](https://github.com/apache/datafusion-python/pull/1058) (timsaucer) +- Update python min version to 3.9 [#1043](https://github.com/apache/datafusion-python/pull/1043) (kevinjqliu) +- feat/improve ruff test coverage [#1055](https://github.com/apache/datafusion-python/pull/1055) (timsaucer) +- feat/making global context accessible for users [#1060](https://github.com/apache/datafusion-python/pull/1060) (jsai28) +- Renaming Internal Structs [#1059](https://github.com/apache/datafusion-python/pull/1059) (Spaarsh) +- test: add pytest asyncio tests [#1063](https://github.com/apache/datafusion-python/pull/1063) (jsai28) +- Add decorator for udwf [#1061](https://github.com/apache/datafusion-python/pull/1061) (kosiew) +- Add additional ruff suggestions [#1062](https://github.com/apache/datafusion-python/pull/1062) (Spaarsh) +- Improve collection during repr and repr_html [#1036](https://github.com/apache/datafusion-python/pull/1036) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 7 Tim Saucer + 2 Kevin Liu + 2 Spaarsh + 2 jsai28 + 2 kosiew + 1 Chen Chongchen + 1 Chongchen Chen + 1 Crystal Zhou + 1 Ion Koutsouris + 1 Nirnay Roy + 1 dependabot[bot] +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. 
+ From ffafb59e1b1b7f49f4ba4507b28ba1cecfb0225a Mon Sep 17 00:00:00 2001 From: Chen Chongchen Date: Sun, 30 Mar 2025 20:45:15 +0800 Subject: [PATCH 119/248] feat: support unparser (#1088) * support unparser * add license * add export * format * format --- python/datafusion/__init__.py | 3 +- python/datafusion/unparser.py | 80 +++++++++++++++++++++++++++++++++++ python/tests/test_unparser.py | 33 +++++++++++++++ src/lib.rs | 5 +++ src/unparser/dialect.rs | 63 +++++++++++++++++++++++++++ src/unparser/mod.rs | 66 +++++++++++++++++++++++++++++ 6 files changed, 249 insertions(+), 1 deletion(-) create mode 100644 python/datafusion/unparser.py create mode 100644 python/tests/test_unparser.py create mode 100644 src/unparser/dialect.rs create mode 100644 src/unparser/mod.rs diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index d871fdb71..ecf5545bc 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -26,7 +26,7 @@ except ImportError: import importlib_metadata -from . import functions, object_store, substrait +from . import functions, object_store, substrait, unparser # The following imports are okay to remain as opaque to the user. from ._internal import Config @@ -89,6 +89,7 @@ "udaf", "udf", "udwf", + "unparser", ] diff --git a/python/datafusion/unparser.py b/python/datafusion/unparser.py new file mode 100644 index 000000000..7ca5b9190 --- /dev/null +++ b/python/datafusion/unparser.py @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""This module provides support for unparsing datafusion plans to SQL. + +For additional information about unparsing, see https://docs.rs/datafusion-sql/latest/datafusion_sql/unparser/index.html +""" + +from ._internal import unparser as unparser_internal +from .plan import LogicalPlan + + +class Dialect: + """DataFusion data catalog.""" + + def __init__(self, dialect: unparser_internal.Dialect) -> None: + """This constructor is not typically called by the end user.""" + self.dialect = dialect + + @staticmethod + def default() -> "Dialect": + """Create a new default dialect.""" + return Dialect(unparser_internal.Dialect.default()) + + @staticmethod + def mysql() -> "Dialect": + """Create a new MySQL dialect.""" + return Dialect(unparser_internal.Dialect.mysql()) + + @staticmethod + def postgres() -> "Dialect": + """Create a new PostgreSQL dialect.""" + return Dialect(unparser_internal.Dialect.postgres()) + + @staticmethod + def sqlite() -> "Dialect": + """Create a new SQLite dialect.""" + return Dialect(unparser_internal.Dialect.sqlite()) + + @staticmethod + def duckdb() -> "Dialect": + """Create a new DuckDB dialect.""" + return Dialect(unparser_internal.Dialect.duckdb()) + + +class Unparser: + """DataFusion unparser.""" + + def __init__(self, dialect: Dialect) -> None: + """This constructor is not typically called by the end user.""" + self.unparser = unparser_internal.Unparser(dialect.dialect) + + def plan_to_sql(self, plan: LogicalPlan) -> str: + """Convert a logical plan to a SQL string.""" + return self.unparser.plan_to_sql(plan._raw_plan) + + def 
with_pretty(self, pretty: bool) -> "Unparser": + """Set the pretty flag.""" + self.unparser = self.unparser.with_pretty(pretty) + return self + + +__all__ = [ + "Dialect", + "Unparser", +] diff --git a/python/tests/test_unparser.py b/python/tests/test_unparser.py new file mode 100644 index 000000000..c4e05780c --- /dev/null +++ b/python/tests/test_unparser.py @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from datafusion.context import SessionContext +from datafusion.unparser import Dialect, Unparser + + +def test_unparser(): + ctx = SessionContext() + df = ctx.sql("SELECT 1") + for dialect in [ + Dialect.mysql(), + Dialect.postgres(), + Dialect.sqlite(), + Dialect.duckdb(), + ]: + unparser = Unparser(dialect) + sql = unparser.plan_to_sql(df.logical_plan()) + assert sql == "SELECT 1" diff --git a/src/lib.rs b/src/lib.rs index ce93ff0c3..6eeda0878 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -52,6 +52,7 @@ pub mod pyarrow_util; mod record_batch; pub mod sql; pub mod store; +pub mod unparser; #[cfg(feature = "substrait")] pub mod substrait; @@ -103,6 +104,10 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { expr::init_module(&expr)?; m.add_submodule(&expr)?; + let unparser = PyModule::new(py, "unparser")?; + unparser::init_module(&unparser)?; + m.add_submodule(&unparser)?; + // Register the functions as a submodule let funcs = PyModule::new(py, "functions")?; functions::init_module(&funcs)?; diff --git a/src/unparser/dialect.rs b/src/unparser/dialect.rs new file mode 100644 index 000000000..caeef9949 --- /dev/null +++ b/src/unparser/dialect.rs @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion::sql::unparser::dialect::{ + DefaultDialect, Dialect, DuckDBDialect, MySqlDialect, PostgreSqlDialect, SqliteDialect, +}; +use pyo3::prelude::*; + +#[pyclass(name = "Dialect", module = "datafusion.unparser", subclass)] +#[derive(Clone)] +pub struct PyDialect { + pub dialect: Arc, +} + +#[pymethods] +impl PyDialect { + #[staticmethod] + pub fn default() -> Self { + Self { + dialect: Arc::new(DefaultDialect {}), + } + } + #[staticmethod] + pub fn postgres() -> Self { + Self { + dialect: Arc::new(PostgreSqlDialect {}), + } + } + #[staticmethod] + pub fn mysql() -> Self { + Self { + dialect: Arc::new(MySqlDialect {}), + } + } + #[staticmethod] + pub fn sqlite() -> Self { + Self { + dialect: Arc::new(SqliteDialect {}), + } + } + #[staticmethod] + pub fn duckdb() -> Self { + Self { + dialect: Arc::new(DuckDBDialect::new()), + } + } +} diff --git a/src/unparser/mod.rs b/src/unparser/mod.rs new file mode 100644 index 000000000..b4b0fed10 --- /dev/null +++ b/src/unparser/mod.rs @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +mod dialect; + +use std::sync::Arc; + +use datafusion::sql::unparser::{dialect::Dialect, Unparser}; +use dialect::PyDialect; +use pyo3::{exceptions::PyValueError, prelude::*}; + +use crate::sql::logical::PyLogicalPlan; + +#[pyclass(name = "Unparser", module = "datafusion.unparser", subclass)] +#[derive(Clone)] +pub struct PyUnparser { + dialect: Arc, + pretty: bool, +} + +#[pymethods] +impl PyUnparser { + #[new] + pub fn new(dialect: PyDialect) -> Self { + Self { + dialect: dialect.dialect.clone(), + pretty: false, + } + } + + pub fn plan_to_sql(&self, plan: &PyLogicalPlan) -> PyResult { + let mut unparser = Unparser::new(self.dialect.as_ref()); + unparser = unparser.with_pretty(self.pretty); + let sql = unparser + .plan_to_sql(&plan.plan()) + .map_err(|e| PyValueError::new_err(e.to_string()))?; + Ok(sql.to_string()) + } + + pub fn with_pretty(&self, pretty: bool) -> Self { + Self { + dialect: self.dialect.clone(), + pretty, + } + } +} + +pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + Ok(()) +} From 09b929a65c27ce8c58563d4def8d79b426ae47e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20Sch=C3=A4fer?= <33159547+floscha@users.noreply.github.com> Date: Sun, 30 Mar 2025 14:45:49 +0200 Subject: [PATCH 120/248] Documentation updates: mention correct dataset on basics page (#1081) * Documentation updates: mention correct dataset on basics page * Update docs/source/user-guide/basics.rst Co-authored-by: Kevin Liu * Make download hint more concise --------- Co-authored-by: Kevin Liu --- docs/source/user-guide/basics.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/user-guide/basics.rst b/docs/source/user-guide/basics.rst index f37378a41..6636c0c6a 100644 --- a/docs/source/user-guide/basics.rst +++ b/docs/source/user-guide/basics.rst @@ -20,8 +20,8 @@ Concepts ======== -In this section, we will cover a basic example to introduce a few key concepts. 
We will use the same +source file as described in the :ref:`Introduction `, the Pokemon data set. +In this section, we will cover a basic example to introduce a few key concepts. We will use the +2021 Yellow Taxi Trip Records (`download <https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet>`_), from the `TLC Trip Record Data <https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page>`_. .. ipython:: python From 818975b5c43021fed109ebba3cb99d744e8f036a Mon Sep 17 00:00:00 2001 From: kosiew Date: Mon, 21 Apr 2025 19:51:25 +0800 Subject: [PATCH 121/248] Add Configurable HTML Table Formatter for DataFusion DataFrames in Python (#1100) * feat: add configurable HTML formatter for DataFrames * fix: update schema iteration in DataFrameHtmlFormatter to use correct format * refactor: remove unused constant MAX_LENGTH_CELL_WITHOUT_MINIMIZE in PyTableProvider * refactor: improve HTML rendering structure in DataFrameHtmlFormatter - Added List import to typing for type hints. - Refactored format_html method to modularize HTML component generation. - Created separate methods for building HTML header, table container, header, body, expandable cells, regular cells, and footer for better readability and maintainability. - Updated table_uuid generation to use f-string for consistency. - Ensured all HTML components are returned as lists for efficient joining. * doc: enhance docstrings for DataFrameHtmlFormatter methods to clarify usage * refactor: enhance DataFrameHtmlFormatter with customizable cell and header styles - Added methods `get_cell_style()` and `get_header_style()` to allow subclasses to customize the CSS styles for table cells and headers. - Updated `_build_table_header()` and `_build_regular_cell()` methods to utilize the new styling methods for improved maintainability. - Introduced a registry for custom type formatters in `DataFrameHtmlFormatter` to enable flexible formatting of cell values based on their types.
- Enhanced `_format_cell_value()` to check for registered formatters before defaulting to string conversion, improving extensibility. * refactor: enhance DataFrameHtmlFormatter with custom cell and header builders - Introduced CellFormatter and StyleProvider protocols for better extensibility. - Added DefaultStyleProvider class with default CSS styles for cells and headers. - Updated DataFrameHtmlFormatter to support custom cell and header builders. - Refactored methods to utilize the new style provider for consistent styling. - Improved documentation for methods and classes to clarify usage and customization options. * doc: expand module docstring for DataFrameHtmlFormatter with usage examples and customization options * refactor: streamline HTML formatter by removing extensive docstring examples and enhancing cell formatting methods - Removed lengthy examples from the docstring of DataFrameHtmlFormatter to improve readability. - Added methods for extracting and formatting cell values, enhancing the clarity and maintainability of the code. - Updated cell building methods to utilize the new formatting logic, ensuring consistent application of styles and behaviors. - Introduced a reset fixture for tests to ensure the formatter is returned to default settings after each test case. - Added tests for HTML formatter configuration, custom style providers, type formatters, custom cell builders, and complex customizations to ensure robust functionality. 
* refactor: improve cell rendering logic in DataFrameHtmlFormatter by utilizing raw values for custom cell builders and optimizing expandable cell creation * refactor: enhance HTML representation in DataFrame by integrating latest formatter and improving cell value formatting logic * refactor: improve HTML formatting logic in DataFrame by separating data collection and schema retrieval for clarity refactor: enhance reset_formatter fixture to preserve original formatter configuration during tests * refactor: add debug utilities for HTML formatter integration testing and enhance debugging output in DataFrameHtmlFormatter * refactor: implement HTML formatter patch for DataFrame and enhance value retrieval in cell formatting * fix: correct typo in file extension check for parquet files in test_write_compressed_parquet * test: add test for DataFrame._repr_html_ to validate HTML output structure * refactor: remove monkeypatch for DataFrame._repr_html_ and associated logic * refactor: simplify _repr_html_ method in DataFrame to directly call internal representation * refactor: remove debug utilities for HTML formatter integration in DataFrame * refactor: remove debug print statements from DataFrameHtmlFormatter and add HTML formatter integration tests - Removed debug print statements from format_html, _build_table_body, and get_formatter methods in DataFrameHtmlFormatter to clean up the code. - Introduced a new debug_utils.py file containing a function to check HTML formatter integration. - Updated __init__.py to include configure_formatter for easier access. - Enhanced DataFrame class to include a docstring for _repr_html_ method. - Added comprehensive tests for HTML formatter configuration, custom style providers, type formatters, and cell/header builders in test_dataframe.py. * refactor: streamline imports and enhance HTML formatter integration in tests - Removed redundant import of `configure_formatter` in `__init__.py`. 
- Added `configure_formatter` to `__all__` in `__init__.py` for better module exposure. - Cleaned up import statements in `html_formatter.py` for clarity. - Consolidated import statements in `test_dataframe.py` for improved readability. - Simplified the `reset_formatter` fixture by removing unnecessary imports and comments. * refactor: remove redundant imports and debug print statements in HTML formatter tests * refactor: add reset_formatter function to reset global HTML formatter state - Implemented reset_formatter to create a new default DataFrame HTML formatter and update the global reference. - Added clean_formatter_state fixture in tests to ensure a fresh formatter state for each test case. - Updated test cases to use clean_formatter_state instead of the previous reset_formatter implementation. * refactor: enhance DataFrameHtmlFormatter initialization with parameter validation * test: add custom cell builder test for HTML formatter with value-based styling * test: enhance DataFrame HTML representation tests for structure and values * feat: enhance DataFrameHtmlFormatter with shared styles support and reset functionality - Added `use_shared_styles` parameter to control loading of styles/scripts. - Implemented logic to conditionally include styles based on `use_shared_styles`. - Updated the constructor to validate `use_shared_styles` as a boolean. - Introduced `reset_styles_loaded_state` function to reset the styles loaded state. - Modified `reset_formatter` to reset the `_styles_loaded` flag. 
* refactor: update footer comment in DataFrameHtmlFormatter to clarify content * test: enhance HTML representation test to accommodate span-wrapped values * docs: add usage examples to formatter functions in html_formatter.py * test: add HTML formatter tests for shared styles functionality * feat: add method to check if styles are loaded and enhance schema validation in DataFrameHtmlFormatter * refactor: streamline custom cell builder in HTML formatter tests for clarity and maintainability * fix ruff errors * chore: update license header in html_formatter.py for compliance * refactor: improve HTML formatter tests by updating import statements and enhancing regex patterns for body data * fix clippy errors --- python/datafusion/__init__.py | 2 + python/datafusion/html_formatter.py | 647 ++++++++++++++++++++++++++++ python/tests/test_dataframe.py | 396 ++++++++++++++++- src/dataframe.rs | 130 +----- 4 files changed, 1061 insertions(+), 114 deletions(-) create mode 100644 python/datafusion/html_formatter.py diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index ecf5545bc..60d0d61b4 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -45,6 +45,7 @@ Expr, WindowFrame, ) +from .html_formatter import configure_formatter from .io import read_avro, read_csv, read_json, read_parquet from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream @@ -76,6 +77,7 @@ "col", "column", "common", + "configure_formatter", "expr", "functions", "lit", diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py new file mode 100644 index 000000000..a50e14fd5 --- /dev/null +++ b/python/datafusion/html_formatter.py @@ -0,0 +1,647 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""HTML formatting utilities for DataFusion DataFrames.""" + +from __future__ import annotations + +from typing import ( + Any, + Callable, + Optional, + Protocol, + runtime_checkable, +) + + +@runtime_checkable +class CellFormatter(Protocol): + """Protocol for cell value formatters.""" + + def __call__(self, value: Any) -> str: + """Format a cell value to string representation.""" + ... + + +@runtime_checkable +class StyleProvider(Protocol): + """Protocol for HTML style providers.""" + + def get_cell_style(self) -> str: + """Get the CSS style for table cells.""" + ... + + def get_header_style(self) -> str: + """Get the CSS style for header cells.""" + ... + + +class DefaultStyleProvider: + """Default implementation of StyleProvider.""" + + def get_cell_style(self) -> str: + """Get the CSS style for table cells. + + Returns: + CSS style string + """ + return ( + "border: 1px solid black; padding: 8px; text-align: left; " + "white-space: nowrap;" + ) + + def get_header_style(self) -> str: + """Get the CSS style for header cells. + + Returns: + CSS style string + """ + return ( + "border: 1px solid black; padding: 8px; text-align: left; " + "background-color: #f2f2f2; white-space: nowrap; min-width: fit-content; " + "max-width: fit-content;" + ) + + +class DataFrameHtmlFormatter: + """Configurable HTML formatter for DataFusion DataFrames. 
+ + This class handles the HTML rendering of DataFrames for display in + Jupyter notebooks and other rich display contexts. + + This class supports extension through composition. Key extension points: + - Provide a custom StyleProvider for styling cells and headers + - Register custom formatters for specific types + - Provide custom cell builders for specialized cell rendering + + Args: + max_cell_length: Maximum characters to display in a cell before truncation + max_width: Maximum width of the HTML table in pixels + max_height: Maximum height of the HTML table in pixels + enable_cell_expansion: Whether to add expand/collapse buttons for long cell + values + custom_css: Additional CSS to include in the HTML output + show_truncation_message: Whether to display a message when data is truncated + style_provider: Custom provider for cell and header styles + use_shared_styles: Whether to load styles and scripts only once per notebook + session + """ + + # Class variable to track if styles have been loaded in the notebook + _styles_loaded = False + + def __init__( + self, + max_cell_length: int = 25, + max_width: int = 1000, + max_height: int = 300, + enable_cell_expansion: bool = True, + custom_css: Optional[str] = None, + show_truncation_message: bool = True, + style_provider: Optional[StyleProvider] = None, + use_shared_styles: bool = True, + ) -> None: + """Initialize the HTML formatter. + + Parameters + ---------- + max_cell_length : int, default 25 + Maximum length of cell content before truncation. + max_width : int, default 1000 + Maximum width of the displayed table in pixels. + max_height : int, default 300 + Maximum height of the displayed table in pixels. + enable_cell_expansion : bool, default True + Whether to allow cells to expand when clicked. + custom_css : str, optional + Custom CSS to apply to the HTML table. + show_truncation_message : bool, default True + Whether to show a message indicating that content has been truncated. 
+ style_provider : StyleProvider, optional + Provider of CSS styles for the HTML table. If None, DefaultStyleProvider + is used. + use_shared_styles : bool, default True + Whether to use shared styles across multiple tables. + + Raises: + ------ + ValueError + If max_cell_length, max_width, or max_height is not a positive integer. + TypeError + If enable_cell_expansion, show_truncation_message, or use_shared_styles is + not a boolean, + or if custom_css is provided but is not a string, + or if style_provider is provided but does not implement the StyleProvider + protocol. + """ + # Validate numeric parameters + + if not isinstance(max_cell_length, int) or max_cell_length <= 0: + msg = "max_cell_length must be a positive integer" + raise ValueError(msg) + if not isinstance(max_width, int) or max_width <= 0: + msg = "max_width must be a positive integer" + raise ValueError(msg) + if not isinstance(max_height, int) or max_height <= 0: + msg = "max_height must be a positive integer" + raise ValueError(msg) + + # Validate boolean parameters + if not isinstance(enable_cell_expansion, bool): + msg = "enable_cell_expansion must be a boolean" + raise TypeError(msg) + if not isinstance(show_truncation_message, bool): + msg = "show_truncation_message must be a boolean" + raise TypeError(msg) + if not isinstance(use_shared_styles, bool): + msg = "use_shared_styles must be a boolean" + raise TypeError(msg) + + # Validate custom_css + if custom_css is not None and not isinstance(custom_css, str): + msg = "custom_css must be None or a string" + raise TypeError(msg) + + # Validate style_provider + if style_provider is not None and not isinstance(style_provider, StyleProvider): + msg = "style_provider must implement the StyleProvider protocol" + raise TypeError(msg) + + self.max_cell_length = max_cell_length + self.max_width = max_width + self.max_height = max_height + self.enable_cell_expansion = enable_cell_expansion + self.custom_css = custom_css + self.show_truncation_message = 
show_truncation_message + self.style_provider = style_provider or DefaultStyleProvider() + self.use_shared_styles = use_shared_styles + # Registry for custom type formatters + self._type_formatters: dict[type, CellFormatter] = {} + # Custom cell builders + self._custom_cell_builder: Optional[Callable[[Any, int, int, str], str]] = None + self._custom_header_builder: Optional[Callable[[Any], str]] = None + + def register_formatter(self, type_class: type, formatter: CellFormatter) -> None: + """Register a custom formatter for a specific data type. + + Args: + type_class: The type to register a formatter for + formatter: Function that takes a value of the given type and returns + a formatted string + """ + self._type_formatters[type_class] = formatter + + def set_custom_cell_builder( + self, builder: Callable[[Any, int, int, str], str] + ) -> None: + """Set a custom cell builder function. + + Args: + builder: Function that takes (value, row, col, table_id) and returns HTML + """ + self._custom_cell_builder = builder + + def set_custom_header_builder(self, builder: Callable[[Any], str]) -> None: + """Set a custom header builder function. + + Args: + builder: Function that takes a field and returns HTML + """ + self._custom_header_builder = builder + + @classmethod + def is_styles_loaded(cls) -> bool: + """Check if HTML styles have been loaded in the current session. + + This method is primarily intended for debugging UI rendering issues + related to style loading. + + Returns: + True if styles have been loaded, False otherwise + + Example: + >>> from datafusion.html_formatter import DataFrameHtmlFormatter + >>> DataFrameHtmlFormatter.is_styles_loaded() + False + """ + return cls._styles_loaded + + def format_html( + self, + batches: list, + schema: Any, + has_more: bool = False, + table_uuid: str | None = None, + ) -> str: + """Format record batches as HTML. 
+ + This method is used by DataFrame's _repr_html_ implementation and can be + called directly when custom HTML rendering is needed. + + Args: + batches: List of Arrow RecordBatch objects + schema: Arrow Schema object + has_more: Whether there are more batches not shown + table_uuid: Unique ID for the table, used for JavaScript interactions + + Returns: + HTML string representation of the data + + Raises: + TypeError: If schema is invalid and no batches are provided + """ + if not batches: + return "No data to display" + + # Validate schema + if schema is None or not hasattr(schema, "__iter__"): + msg = "Schema must be provided" + raise TypeError(msg) + + # Generate a unique ID if none provided + table_uuid = table_uuid or f"df-{id(batches)}" + + # Build HTML components + html = [] + + # Only include styles and scripts if: + # 1. Not using shared styles, OR + # 2. Using shared styles but they haven't been loaded yet + include_styles = ( + not self.use_shared_styles or not DataFrameHtmlFormatter._styles_loaded + ) + + if include_styles: + html.extend(self._build_html_header()) + # If we're using shared styles, mark them as loaded + if self.use_shared_styles: + DataFrameHtmlFormatter._styles_loaded = True + + html.extend(self._build_table_container_start()) + + # Add table header and body + html.extend(self._build_table_header(schema)) + html.extend(self._build_table_body(batches, table_uuid)) + + html.append("") + html.append("") + + # Add footer (JavaScript and messages) + if include_styles and self.enable_cell_expansion: + html.append(self._get_javascript()) + + # Always add truncation message if needed (independent of styles) + if has_more and self.show_truncation_message: + html.append("
Data truncated due to size.
") + + return "\n".join(html) + + def _build_html_header(self) -> list[str]: + """Build the HTML header with CSS styles.""" + html = [] + html.append("") + return html + + def _build_table_container_start(self) -> list[str]: + """Build the opening tags for the table container.""" + html = [] + html.append( + f'
' + ) + html.append('') + return html + + def _build_table_header(self, schema: Any) -> list[str]: + """Build the HTML table header with column names.""" + html = [] + html.append("") + html.append("") + for field in schema: + if self._custom_header_builder: + html.append(self._custom_header_builder(field)) + else: + html.append( + f"" + ) + html.append("") + html.append("") + return html + + def _build_table_body(self, batches: list, table_uuid: str) -> list[str]: + """Build the HTML table body with data rows.""" + html = [] + html.append("") + + row_count = 0 + for batch in batches: + for row_idx in range(batch.num_rows): + row_count += 1 + html.append("") + + for col_idx, column in enumerate(batch.columns): + # Get the raw value from the column + raw_value = self._get_cell_value(column, row_idx) + + # Always check for type formatters first to format the value + formatted_value = self._format_cell_value(raw_value) + + # Then apply either custom cell builder or standard cell formatting + if self._custom_cell_builder: + # Pass both the raw value and formatted value to let the + # builder decide + cell_html = self._custom_cell_builder( + raw_value, row_count, col_idx, table_uuid + ) + html.append(cell_html) + else: + # Standard cell formatting with formatted value + if ( + len(str(raw_value)) > self.max_cell_length + and self.enable_cell_expansion + ): + cell_html = self._build_expandable_cell( + formatted_value, row_count, col_idx, table_uuid + ) + else: + cell_html = self._build_regular_cell(formatted_value) + html.append(cell_html) + + html.append("") + + html.append("") + return html + + def _get_cell_value(self, column: Any, row_idx: int) -> Any: + """Extract a cell value from a column. 
+ + Args: + column: Arrow array + row_idx: Row index + + Returns: + The raw cell value + """ + try: + value = column[row_idx] + + if hasattr(value, "as_py"): + return value.as_py() + except (AttributeError, TypeError): + pass + else: + return value + + def _format_cell_value(self, value: Any) -> str: + """Format a cell value for display. + + Uses registered type formatters if available. + + Args: + value: The cell value to format + + Returns: + Formatted cell value as string + """ + # Check for custom type formatters + for type_cls, formatter in self._type_formatters.items(): + if isinstance(value, type_cls): + return formatter(value) + + # If no formatter matched, return string representation + return str(value) + + def _build_expandable_cell( + self, formatted_value: str, row_count: int, col_idx: int, table_uuid: str + ) -> str: + """Build an expandable cell for long content.""" + short_value = str(formatted_value)[: self.max_cell_length] + return ( + f"" + ) + + def _build_regular_cell(self, formatted_value: str) -> str: + """Build a regular table cell.""" + return ( + f"" + ) + + def _build_html_footer(self, has_more: bool) -> list[str]: + """Build the HTML footer with JavaScript and messages.""" + html = [] + + # Add JavaScript for interactivity only if cell expansion is enabled + # and we're not using the shared styles approach + if self.enable_cell_expansion and not self.use_shared_styles: + html.append(self._get_javascript()) + + # Add truncation message if needed + if has_more and self.show_truncation_message: + html.append("
Data truncated due to size.
") + + return html + + def _get_default_css(self) -> str: + """Get default CSS styles for the HTML table.""" + return """ + .expandable-container { + display: inline-block; + max-width: 200px; + } + .expandable { + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + display: block; + } + .full-text { + display: none; + white-space: normal; + } + .expand-btn { + cursor: pointer; + color: blue; + text-decoration: underline; + border: none; + background: none; + font-size: inherit; + display: block; + margin-top: 5px; + } + """ + + def _get_javascript(self) -> str: + """Get JavaScript code for interactive elements.""" + return """ + + """ + + +class FormatterManager: + """Manager class for the global DataFrame HTML formatter instance.""" + + _default_formatter: DataFrameHtmlFormatter = DataFrameHtmlFormatter() + + @classmethod + def set_formatter(cls, formatter: DataFrameHtmlFormatter) -> None: + """Set the global DataFrame HTML formatter. + + Args: + formatter: The formatter instance to use globally + """ + cls._default_formatter = formatter + _refresh_formatter_reference() + + @classmethod + def get_formatter(cls) -> DataFrameHtmlFormatter: + """Get the current global DataFrame HTML formatter. + + Returns: + The global HTML formatter instance + """ + return cls._default_formatter + + +def get_formatter() -> DataFrameHtmlFormatter: + """Get the current global DataFrame HTML formatter. + + This function is used by the DataFrame._repr_html_ implementation to access + the shared formatter instance. It can also be used directly when custom + HTML rendering is needed. + + Returns: + The global HTML formatter instance + + Example: + >>> from datafusion.html_formatter import get_formatter + >>> formatter = get_formatter() + >>> formatter.max_cell_length = 50 # Increase cell length + """ + return FormatterManager.get_formatter() + + +def set_formatter(formatter: DataFrameHtmlFormatter) -> None: + """Set the global DataFrame HTML formatter. 
+ + Args: + formatter: The formatter instance to use globally + + Example: + >>> from datafusion.html_formatter import get_formatter, set_formatter + >>> custom_formatter = DataFrameHtmlFormatter(max_cell_length=100) + >>> set_formatter(custom_formatter) + """ + FormatterManager.set_formatter(formatter) + + +def configure_formatter(**kwargs: Any) -> None: + """Configure the global DataFrame HTML formatter. + + This function creates a new formatter with the provided configuration + and sets it as the global formatter for all DataFrames. + + Args: + **kwargs: Formatter configuration parameters like max_cell_length, + max_width, max_height, enable_cell_expansion, etc. + + Example: + >>> from datafusion.html_formatter import configure_formatter + >>> configure_formatter( + ... max_cell_length=50, + ... max_height=500, + ... enable_cell_expansion=True, + ... use_shared_styles=True + ... ) + """ + set_formatter(DataFrameHtmlFormatter(**kwargs)) + + +def reset_formatter() -> None: + """Reset the global DataFrame HTML formatter to default settings. + + This function creates a new formatter with default configuration + and sets it as the global formatter for all DataFrames. + + Example: + >>> from datafusion.html_formatter import reset_formatter + >>> reset_formatter() # Reset formatter to default settings + """ + formatter = DataFrameHtmlFormatter() + # Reset the styles_loaded flag to ensure styles will be reloaded + DataFrameHtmlFormatter._styles_loaded = False + set_formatter(formatter) + + +def reset_styles_loaded_state() -> None: + """Reset the styles loaded state to force reloading of styles. + + This can be useful when switching between notebook sessions or + when styles need to be refreshed. 
+ + Example: + >>> from datafusion.html_formatter import reset_styles_loaded_state + >>> reset_styles_loaded_state() # Force styles to reload in next render + """ + DataFrameHtmlFormatter._styles_loaded = False + + +def _refresh_formatter_reference() -> None: + """Refresh formatter reference in any modules using it. + + This helps ensure that changes to the formatter are reflected in existing + DataFrames that might be caching the formatter reference. + """ + # This is a no-op but signals modules to refresh their reference diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index eda13930d..464b884db 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -28,8 +28,17 @@ column, literal, ) -from datafusion import functions as f +from datafusion import ( + functions as f, +) from datafusion.expr import Window +from datafusion.html_formatter import ( + DataFrameHtmlFormatter, + configure_formatter, + get_formatter, + reset_formatter, + reset_styles_loaded_state, +) from pyarrow.csv import write_csv @@ -102,6 +111,12 @@ def partitioned_df(): return ctx.create_dataframe([[batch]]) +@pytest.fixture +def clean_formatter_state(): + """Reset the HTML formatter after each test.""" + reset_formatter() + + def test_select(df): df_1 = df.select( column("a") + column("b"), @@ -656,6 +671,252 @@ def test_window_frame_defaults_match_postgres(partitioned_df): assert df_2.sort(col_a).to_pydict() == expected +def test_html_formatter_configuration(df, clean_formatter_state): + """Test configuring the HTML formatter with different options.""" + # Configure with custom settings + configure_formatter( + max_cell_length=5, + max_width=500, + max_height=200, + enable_cell_expansion=False, + ) + + html_output = df._repr_html_() + + # Verify our configuration was applied + assert "max-height: 200px" in html_output + assert "max-width: 500px" in html_output + # With cell expansion disabled, we shouldn't see expandable-container elements + 
assert "expandable-container" not in html_output + + +def test_html_formatter_custom_style_provider(df, clean_formatter_state): + """Test using custom style providers with the HTML formatter.""" + + class CustomStyleProvider: + def get_cell_style(self) -> str: + return ( + "background-color: #f5f5f5; color: #333; padding: 8px; border: " + "1px solid #ddd;" + ) + + def get_header_style(self) -> str: + return ( + "background-color: #4285f4; color: white; font-weight: bold; " + "padding: 10px; border: 1px solid #3367d6;" + ) + + # Configure with custom style provider + configure_formatter(style_provider=CustomStyleProvider()) + + html_output = df._repr_html_() + + # Verify our custom styles were applied + assert "background-color: #4285f4" in html_output + assert "color: white" in html_output + assert "background-color: #f5f5f5" in html_output + + +def test_html_formatter_type_formatters(df, clean_formatter_state): + """Test registering custom type formatters for specific data types.""" + + # Get current formatter and register custom formatters + formatter = get_formatter() + + # Format integers with color based on value + # Using int as the type for the formatter will work since we convert + # Arrow scalar values to Python native types in _get_cell_value + def format_int(value): + return f' 2 else "blue"}">{value}' + + formatter.register_formatter(int, format_int) + + html_output = df._repr_html_() + + # Our test dataframe has values 1,2,3 so we should see: + assert '1' in html_output + + +def test_html_formatter_custom_cell_builder(df, clean_formatter_state): + """Test using a custom cell builder function.""" + + # Create a custom cell builder with distinct styling for different value ranges + def custom_cell_builder(value, row, col, table_id): + try: + num_value = int(value) + if num_value > 5: # Values > 5 get green background with indicator + return ( + '' + ) + if num_value < 3: # Values < 3 get blue background with indicator + return ( + '' + ) + except 
(ValueError, TypeError): + pass + + # Default styling for other cells (3, 4, 5) + return f'' + + # Set our custom cell builder + formatter = get_formatter() + formatter.set_custom_cell_builder(custom_cell_builder) + + html_output = df._repr_html_() + + # Extract cells with specific styling using regex + low_cells = re.findall( + r'', html_output + ) + mid_cells = re.findall( + r'', html_output + ) + high_cells = re.findall( + r'', html_output + ) + + # Sort the extracted values for consistent comparison + low_cells = sorted(map(int, low_cells)) + mid_cells = sorted(map(int, mid_cells)) + high_cells = sorted(map(int, high_cells)) + + # Verify specific values have the correct styling applied + assert low_cells == [1, 2] # Values < 3 + assert mid_cells == [3, 4, 5, 5] # Values 3-5 + assert high_cells == [6, 8, 8] # Values > 5 + + # Verify the exact content with styling appears in the output + assert ( + '' + in html_output + ) + assert ( + '' + in html_output + ) + assert ( + '' in html_output + ) + assert ( + '' in html_output + ) + assert ( + '' + in html_output + ) + assert ( + '' + in html_output + ) + + # Count occurrences to ensure all cells are properly styled + assert html_output.count("-low") == 2 # Two low values (1, 2) + assert html_output.count("-mid") == 4 # Four mid values (3, 4, 5, 5) + assert html_output.count("-high") == 3 # Three high values (6, 8, 8) + + # Create a custom cell builder that changes background color based on value + def custom_cell_builder(value, row, col, table_id): + # Handle numeric values regardless of their exact type + try: + num_value = int(value) + if num_value > 5: # Values > 5 get green background + return f'' + if num_value < 3: # Values < 3 get light blue background + return f'' + except (ValueError, TypeError): + pass + + # Default styling for other cells + return f'' + + # Set our custom cell builder + formatter = get_formatter() + formatter.set_custom_cell_builder(custom_cell_builder) + + html_output = df._repr_html_() 
+ + # Verify our custom cell styling was applied + assert "background-color: #d3e9f0" in html_output # For values 1,2 + + +def test_html_formatter_custom_header_builder(df, clean_formatter_state): + """Test using a custom header builder function.""" + + # Create a custom header builder with tooltips + def custom_header_builder(field): + tooltips = { + "a": "Primary key column", + "b": "Secondary values", + "c": "Additional data", + } + tooltip = tooltips.get(field.name, "") + return ( + f'' + ) + + # Set our custom header builder + formatter = get_formatter() + formatter.set_custom_header_builder(custom_header_builder) + + html_output = df._repr_html_() + + # Verify our custom headers were applied + assert 'title="Primary key column"' in html_output + assert 'title="Secondary values"' in html_output + assert "background-color: #333; color: white" in html_output + + +def test_html_formatter_complex_customization(df, clean_formatter_state): + """Test combining multiple customization options together.""" + + # Create a dark mode style provider + class DarkModeStyleProvider: + def get_cell_style(self) -> str: + return ( + "background-color: #222; color: #eee; " + "padding: 8px; border: 1px solid #444;" + ) + + def get_header_style(self) -> str: + return ( + "background-color: #111; color: #fff; padding: 10px; " + "border: 1px solid #333;" + ) + + # Configure with dark mode style + configure_formatter( + max_cell_length=10, + style_provider=DarkModeStyleProvider(), + custom_css=""" + .datafusion-table { + font-family: monospace; + border-collapse: collapse; + } + .datafusion-table tr:hover td { + background-color: #444 !important; + } + """, + ) + + # Add type formatters for special formatting - now working with native int values + formatter = get_formatter() + formatter.register_formatter( + int, + lambda n: f'{n}', + ) + + html_output = df._repr_html_() + + # Verify our customizations were applied + assert "background-color: #222" in html_output + assert 
"background-color: #111" in html_output + assert ".datafusion-table" in html_output + assert "color: #5af" in html_output # Even numbers + + def test_get_dataframe(tmp_path): ctx = SessionContext() @@ -1244,7 +1505,10 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame: assert result["new_col"] == [3 for _i in range(3)] -def test_dataframe_repr_html(df) -> None: +def test_dataframe_repr_html_structure(df) -> None: + """Test that DataFrame._repr_html_ produces expected HTML output structure.""" + import re + output = df._repr_html_() # Since we've added a fair bit of processing to the html output, lets just verify @@ -1255,9 +1519,131 @@ def test_dataframe_repr_html(df) -> None: headers = ["a", "b", "c"] headers = [f"{v}" for v in headers] header_pattern = "(.*?)".join(headers) - assert len(re.findall(header_pattern, output, re.DOTALL)) == 1 + header_matches = re.findall(header_pattern, output, re.DOTALL) + assert len(header_matches) == 1 + # Update the pattern to handle values that may be wrapped in spans body_data = [[1, 4, 8], [2, 5, 5], [3, 6, 8]] - body_lines = [f"{v}" for inner in body_data for v in inner] + + body_lines = [ + f"(?:]*?>)?{v}(?:)?" + for inner in body_data + for v in inner + ] body_pattern = "(.*?)".join(body_lines) - assert len(re.findall(body_pattern, output, re.DOTALL)) == 1 + + body_matches = re.findall(body_pattern, output, re.DOTALL) + + assert len(body_matches) == 1, "Expected pattern of values not found in HTML output" + + +def test_dataframe_repr_html_values(df): + """Test that DataFrame._repr_html_ contains the expected data values.""" + html = df._repr_html_() + assert html is not None + + # Create a more flexible pattern that handles values being wrapped in spans + # This pattern will match the sequence of values 1,4,8,2,5,5,3,6,8 regardless + # of formatting + pattern = re.compile( + r"]*?>(?:]*?>)?1(?:)?.*?" + r"]*?>(?:]*?>)?4(?:)?.*?" + r"]*?>(?:]*?>)?8(?:)?.*?" + r"]*?>(?:]*?>)?2(?:)?.*?" + r"]*?>(?:]*?>)?5(?:)?.*?" 
+ r"]*?>(?:]*?>)?5(?:)?.*?" + r"]*?>(?:]*?>)?3(?:)?.*?" + r"]*?>(?:]*?>)?6(?:)?.*?" + r"]*?>(?:]*?>)?8(?:)?", + re.DOTALL, + ) + + # Print debug info if the test fails + matches = re.findall(pattern, html) + if not matches: + print(f"HTML output snippet: {html[:500]}...") # noqa: T201 + + assert len(matches) > 0, "Expected pattern of values not found in HTML output" + + +def test_html_formatter_shared_styles(df, clean_formatter_state): + """Test that shared styles work correctly across multiple tables.""" + + # First, ensure we're using shared styles + configure_formatter(use_shared_styles=True) + + # Get HTML output for first table - should include styles + html_first = df._repr_html_() + + # Verify styles are included in first render + assert " + // Convert record batches to PyObject list + let py_batches = batches + .into_iter() + .map(|rb| rb.to_pyarrow(py)) + .collect::>>()?; -
-
" + f"{field.name}
" + f"
" + "" + "" + f"{formatted_value}" + f"" + f"
" + f"
{formatted_value}{value}-high{value}-low{value}-mid]*>(\d+)-low]*>(\d+)-mid]*>(\d+)-high1-low2-low3-mid4-mid6-high8-high{value}{value}{value}{field.name}
- \n".to_string(); + let py_schema = self.schema().into_pyobject(py)?; - let schema = batches[0].schema(); + // Get the Python formatter module and call format_html + let formatter_module = py.import("datafusion.html_formatter")?; + let get_formatter = formatter_module.getattr("get_formatter")?; + let formatter = get_formatter.call0()?; - let mut header = Vec::new(); - for field in schema.fields() { - header.push(format!("", field.name())); - } - let header_str = header.join(""); - html_str.push_str(&format!("{}\n", header_str)); - - let batch_formatters = batches - .iter() - .map(|batch| { - batch - .columns() - .iter() - .map(|c| ArrayFormatter::try_new(c.as_ref(), &FormatOptions::default())) - .map(|c| { - c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string()))) - }) - .collect::, _>>() - }) - .collect::, _>>()?; - - let rows_per_batch = batches.iter().map(|batch| batch.num_rows()); - - // We need to build up row by row for html - let mut table_row = 0; - for (batch_formatter, num_rows_in_batch) in batch_formatters.iter().zip(rows_per_batch) { - for batch_row in 0..num_rows_in_batch { - table_row += 1; - let mut cells = Vec::new(); - for (col, formatter) in batch_formatter.iter().enumerate() { - let cell_data = formatter.value(batch_row).to_string(); - // From testing, primitive data types do not typically get larger than 21 characters - if cell_data.len() > MAX_LENGTH_CELL_WITHOUT_MINIMIZE { - let short_cell_data = &cell_data[0..MAX_LENGTH_CELL_WITHOUT_MINIMIZE]; - cells.push(format!(" - ")); - } else { - cells.push(format!("", formatter.value(batch_row))); - } - } - let row_str = cells.join(""); - html_str.push_str(&format!("{}\n", row_str)); - } - } - html_str.push_str("
{}
-
- {short_cell_data} - {cell_data} - -
-
{}
\n"); - - html_str.push_str(" - - "); + // Call format_html method on the formatter + let kwargs = pyo3::types::PyDict::new(py); + let py_batches_list = PyList::new(py, py_batches.as_slice())?; + kwargs.set_item("batches", py_batches_list)?; + kwargs.set_item("schema", py_schema)?; + kwargs.set_item("has_more", has_more)?; + kwargs.set_item("table_uuid", table_uuid)?; - if has_more { - html_str.push_str("Data truncated due to size."); - } + let html_result = formatter.call_method("format_html", (), Some(&kwargs))?; + let html_str: String = html_result.extract()?; Ok(html_str) } @@ -835,7 +747,7 @@ fn record_batch_into_schema( ) -> Result { let schema = Arc::new(schema.clone()); let base_schema = record_batch.schema(); - if base_schema.fields().len() == 0 { + if base_schema.fields().is_empty() { // Nothing to project return Ok(RecordBatch::new_empty(schema)); } From d0d14f6e1584f9569cbf2e36c8a7abc7c70fd903 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 24 Apr 2025 09:38:38 -0400 Subject: [PATCH 122/248] feat: update datafusion dependency 47 (#1107) * Update cargo to use DF47 release candidate * Need to be explicit for collection of Expr due to change in dataframe API * Add missing enum variant * Add missing enum variants * The interface for last_value of aggregates upstream changed * Cargo fmt * last value aggregate without ordering is ill defined * Clippy warning * Set datafusion version to 47 now that it is released --- Cargo.lock | 600 +++++++++++++++++++------------ Cargo.toml | 24 +- python/tests/test_aggregation.py | 1 - src/dataframe.rs | 2 +- src/dataset_exec.rs | 4 +- src/expr.rs | 16 +- src/functions.rs | 37 +- 7 files changed, 415 insertions(+), 269 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f90038c50..b32d19d4d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -179,9 +179,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.3.0" +version = "55.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "84ef243634a39fb6e9d1710737e7a5ef96c9bacabd2326859ff889bc9ef755e5" +checksum = "3095aaf545942ff5abd46654534f15b03a90fba78299d661e045e5d587222f0d" dependencies = [ "arrow-arith", "arrow-array", @@ -201,9 +201,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f420c6aef51dad2e4a96ce29c0ec90ad84880bdb60b321c74c652a6be07b93f" +checksum = "00752064ff47cee746e816ddb8450520c3a52cbad1e256f6fa861a35f86c45e7" dependencies = [ "arrow-array", "arrow-buffer", @@ -215,9 +215,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24bda5ff6461a4ff9739959b3d57b377f45e3f878f7be1a4f28137c0a8f339fa" +checksum = "cebfe926794fbc1f49ddd0cdaf898956ca9f6e79541efce62dabccfd81380472" dependencies = [ "ahash", "arrow-buffer", @@ -232,9 +232,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6ed265c73f134a583d02c3cab5e16afab9446d8048ede8707e31f85fad58a0" +checksum = "0303c7ec4cf1a2c60310fc4d6bbc3350cd051a17bf9e9c0a8e47b4db79277824" dependencies = [ "bytes", "half", @@ -243,9 +243,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01c648572391edcef10e5fd458db70ba27ed6f71bcaee04397d0cfb100b34f8b" +checksum = "335f769c5a218ea823d3760a743feba1ef7857cba114c01399a891c2fff34285" dependencies = [ "arrow-array", "arrow-buffer", @@ -264,9 +264,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a02fb265a6d8011a7d3ad1a36f25816ad0a3bb04cb8e9fe7929c165b98c0cbcd" +checksum = "510db7dfbb4d5761826516cc611d97b3a68835d0ece95b034a052601109c0b1b" dependencies = [ "arrow-array", "arrow-cast", @@ -280,9 +280,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f2cebf504bb6a92a134a87fff98f01b14fbb3a93ecf7aef90cd0f888c5fffa4" +checksum = "e8affacf3351a24039ea24adab06f316ded523b6f8c3dbe28fbac5f18743451b" dependencies = [ "arrow-buffer", "arrow-schema", @@ -292,9 +292,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e6405b287671c88846e7751f7291f717b164911474cabac6d3d8614d5aa7374" +checksum = "69880a9e6934d9cba2b8630dd08a3463a91db8693b16b499d54026b6137af284" dependencies = [ "arrow-array", "arrow-buffer", @@ -306,9 +306,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5329bf9e7390cbb6b117ddd4d82e94c5362ea4cab5095697139429f36a38350c" +checksum = "d8dafd17a05449e31e0114d740530e0ada7379d7cb9c338fd65b09a8130960b0" dependencies = [ "arrow-array", "arrow-buffer", @@ -328,9 +328,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e103c13d4b80da28339c1d7aa23dd85bd59f42158acc45d39eeb6770627909ce" +checksum = "895644523af4e17502d42c3cb6b27cb820f0cb77954c22d75c23a85247c849e1" dependencies = [ "arrow-array", "arrow-buffer", @@ -341,9 +341,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "170549a11b8534f3097a0619cfe89c42812345dc998bcf81128fc700b84345b8" +checksum = 
"9be8a2a4e5e7d9c822b2b8095ecd77010576d824f654d347817640acfc97d229" dependencies = [ "arrow-array", "arrow-buffer", @@ -354,18 +354,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5c53775bba63f319189f366d2b86e9a8889373eb198f07d8544938fc9f8ed9a" +checksum = "7450c76ab7c5a6805be3440dc2e2096010da58f7cab301fdc996a4ee3ee74e49" dependencies = [ - "bitflags 2.8.0", + "bitflags", ] [[package]] name = "arrow-select" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a99003b2eb562b8d9c99dfb672306f15e94b20d3734179d596895703e821dcf" +checksum = "aa5f5a93c75f46ef48e4001535e7b6c922eeb0aa20b73cf58d09e13d057490d8" dependencies = [ "ahash", "arrow-array", @@ -377,9 +377,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90fdb130ee8325f4cd8262e19bb6baa3cbcef2b2573c4bee8c6fda7ea08199d7" +checksum = "6e7005d858d84b56428ba2a98a107fe88c0132c61793cf6b8232a1f9bfc0452b" dependencies = [ "arrow-array", "arrow-buffer", @@ -406,11 +406,11 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.18" +version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df895a515f70646414f4b45c0b79082783b80552b373a68283012928df56f522" +checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" dependencies = [ - "bzip2 0.4.4", + "bzip2 0.5.2", "flate2", "futures-core", "memchr", @@ -438,18 +438,18 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "async-trait" -version = "0.1.86" +version = "0.1.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"644dd749086bf3771a2fbc5f256fdb982d53f011c7d5d560304eafeecebce79d" +checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -502,9 +502,9 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "bigdecimal" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f31f3af01c5c65a07985c804d3366560e6fa7883d640a122819b14ec327482c" +checksum = "1a22f228ab7a1b23027ccc6c350b72868017af7ea8356fbdf19f8d991c690013" dependencies = [ "autocfg", "libm", @@ -514,12 +514,6 @@ dependencies = [ "serde", ] -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.8.0" @@ -537,9 +531,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.7.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b17679a8d69b6d7fd9cd9801a536cec9fa5e5970b69f9d4747f70b39b031f5e7" +checksum = "389a099b34312839e16420d499a9cad9650541715937ffbdd40d36f49e77eeb3" dependencies = [ "arrayref", "arrayvec", @@ -608,21 +602,20 @@ dependencies = [ [[package]] name = "bzip2" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75b89e7c29231c673a61a46e722602bcd138298f6b9e81e71119693534585f5c" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" dependencies = [ "bzip2-sys", ] [[package]] name = "bzip2-sys" -version = "0.1.12+1.0.8" +version = "0.1.13+1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72ebc2f1a417f01e1da30ef264ee86ae31d2dcd2d603ea283d3c244a883ca2a9" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" 
dependencies = [ "cc", - "libc", "pkg-config", ] @@ -866,23 +859,26 @@ dependencies = [ [[package]] name = "datafusion" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "914e6f9525599579abbd90b0f7a55afcaaaa40350b9e9ed52563f126dfe45fd3" +checksum = "ffe060b978f74ab446be722adb8a274e052e005bf6dfd171caadc3abaad10080" dependencies = [ - "apache-avro", "arrow", "arrow-ipc", "arrow-schema", "async-trait", "bytes", - "bzip2 0.5.1", + "bzip2 0.5.2", "chrono", "datafusion-catalog", "datafusion-catalog-listing", "datafusion-common", "datafusion-common-runtime", "datafusion-datasource", + "datafusion-datasource-avro", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", @@ -897,12 +893,12 @@ dependencies = [ "datafusion-physical-expr-common", "datafusion-physical-optimizer", "datafusion-physical-plan", + "datafusion-session", "datafusion-sql", "flate2", "futures", "itertools 0.14.0", "log", - "num-traits", "object_store", "parking_lot", "parquet", @@ -919,29 +915,35 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "998a6549e6ee4ee3980e05590b2960446a56b343ea30199ef38acd0e0b9036e2" +checksum = "61fe34f401bd03724a1f96d12108144f8cd495a3cdda2bf5e091822fb80b7e66" dependencies = [ "arrow", "async-trait", "dashmap", "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", "datafusion-execution", "datafusion-expr", + "datafusion-physical-expr", "datafusion-physical-plan", + "datafusion-session", "datafusion-sql", "futures", "itertools 0.14.0", "log", + "object_store", "parking_lot", + "tokio", ] [[package]] name = "datafusion-catalog-listing" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a5ac10096a5b3c0d8a227176c0e543606860842e943594ccddb45cf42a526e43" +checksum = "a4411b8e3bce5e0fc7521e44f201def2e2d5d1b5f176fb56e8cdc9942c890f00" dependencies = [ "arrow", "async-trait", @@ -953,6 +955,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", + "datafusion-session", "futures", "log", "object_store", @@ -961,9 +964,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f53d7ec508e1b3f68bd301cee3f649834fad51eff9240d898a4b2614cfd0a7a" +checksum = "0734015d81c8375eb5d4869b7f7ecccc2ee8d6cb81948ef737cd0e7b743bd69c" dependencies = [ "ahash", "apache-avro", @@ -986,27 +989,27 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0fcf41523b22e14cc349b01526e8b9f59206653037f2949a4adbfde5f8cb668" +checksum = "5167bb1d2ccbb87c6bc36c295274d7a0519b14afcfdaf401d53cbcaa4ef4968b" dependencies = [ + "futures", "log", "tokio", ] [[package]] name = "datafusion-datasource" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf7f37ad8b6e88b46c7eeab3236147d32ea64b823544f498455a8d9042839c92" +checksum = "04e602dcdf2f50c2abf297cc2203c73531e6f48b29516af7695d338cf2a778b1" dependencies = [ "arrow", "async-compression", "async-trait", "bytes", - "bzip2 0.5.1", + "bzip2 0.5.2", "chrono", - "datafusion-catalog", "datafusion-common", "datafusion-common-runtime", "datafusion-execution", @@ -1014,13 +1017,16 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", + "datafusion-session", "flate2", "futures", "glob", "itertools 0.14.0", "log", "object_store", + "parquet", "rand", + "tempfile", "tokio", "tokio-util", "url", @@ -1028,17 +1034,123 @@ dependencies 
= [ "zstd", ] +[[package]] +name = "datafusion-datasource-avro" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4ea5111aab9d3f2a8bff570343cccb03ce4c203875ef5a566b7d6f1eb72559e" +dependencies = [ + "apache-avro", + "arrow", + "async-trait", + "bytes", + "chrono", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "num-traits", + "object_store", + "tokio", +] + +[[package]] +name = "datafusion-datasource-csv" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bb2253952dc32296ed5b84077cb2e0257fea4be6373e1c376426e17ead4ef6" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-json" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8c7f47a5d2fe03bfa521ec9bafdb8a5c82de8377f60967c3663f00c8790352" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "serde_json", + "tokio", +] + +[[package]] +name = "datafusion-datasource-parquet" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"27d15868ea39ed2dc266728b554f6304acd473de2142281ecfa1294bb7415923" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "parquet", + "rand", + "tokio", +] + [[package]] name = "datafusion-doc" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7db7a0239fd060f359dc56c6e7db726abaa92babaed2fb2e91c3a8b2fff8b256" +checksum = "a91f8c2c5788ef32f48ff56c68e5b545527b744822a284373ac79bba1ba47292" [[package]] name = "datafusion-execution" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0938f9e5b6bc5782be4111cdfb70c02b7b5451bf34fd57e4de062a7f7c4e31f1" +checksum = "06f004d100f49a3658c9da6fb0c3a9b760062d96cd4ad82ccc3b7b69a9fb2f84" dependencies = [ "arrow", "dashmap", @@ -1055,9 +1167,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b36c28b00b00019a8695ad7f1a53ee1673487b90322ecbd604e2cf32894eb14f" +checksum = "7a4e4ce3802609be38eeb607ee72f6fe86c3091460de9dbfae9e18db423b3964" dependencies = [ "arrow", "chrono", @@ -1076,9 +1188,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18f0a851a436c5a2139189eb4617a54e6a9ccb9edc96c4b3c83b3bb7c58b950e" +checksum = "422ac9cf3b22bbbae8cdf8ceb33039107fde1b5492693168f13bd566b1bcc839" dependencies = [ "arrow", "datafusion-common", @@ 
-1089,12 +1201,13 @@ dependencies = [ [[package]] name = "datafusion-ffi" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d740dd9f32a4f4ed1b907e6934201bb059efe6c877532512c661771d973c7b21" +checksum = "5cf3fe9ab492c56daeb7beed526690d33622d388b8870472e0b7b7f55490338c" dependencies = [ "abi_stable", "arrow", + "arrow-schema", "async-ffi", "async-trait", "datafusion", @@ -1108,9 +1221,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3196e37d7b65469fb79fee4f05e5bb58a456831035f9a38aa5919aeb3298d40" +checksum = "2ddf0a0a2db5d2918349c978d42d80926c6aa2459cd8a3c533a84ec4bb63479e" dependencies = [ "arrow", "arrow-buffer", @@ -1137,9 +1250,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adfc2d074d5ee4d9354fdcc9283d5b2b9037849237ddecb8942a29144b77ca05" +checksum = "408a05dafdc70d05a38a29005b8b15e21b0238734dab1e98483fcb58038c5aba" dependencies = [ "ahash", "arrow", @@ -1158,9 +1271,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cbceba0f98d921309a9121b702bcd49289d383684cccabf9a92cda1602f3bbb" +checksum = "756d21da2dd6c9bef97af1504970ff56cbf35d03fbd4ffd62827f02f4d2279d4" dependencies = [ "ahash", "arrow", @@ -1171,9 +1284,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "170e27ce4baa27113ddf5f77f1a7ec484b0dbeda0c7abbd4bad3fc609c8ab71a" +checksum = "8d8d50f6334b378930d992d801a10ac5b3e93b846b39e4a05085742572844537" dependencies = [ "arrow", "arrow-ord", 
@@ -1192,9 +1305,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d3a06a7f0817ded87b026a437e7e51de7f59d48173b0a4e803aa896a7bd6bb5" +checksum = "cc9a97220736c8fff1446e936be90d57216c06f28969f9ffd3b72ac93c958c8a" dependencies = [ "arrow", "async-trait", @@ -1208,9 +1321,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6c608b66496a1e05e3d196131eb9bebea579eed1f59e88d962baf3dda853bc6" +checksum = "cefc2d77646e1aadd1d6a9c40088937aedec04e68c5f0465939912e1291f8193" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1225,9 +1338,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da2f9d83348957b4ad0cd87b5cb9445f2651863a36592fe5484d43b49a5f8d82" +checksum = "dd4aff082c42fa6da99ce0698c85addd5252928c908eb087ca3cfa64ff16b313" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1235,20 +1348,20 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4800e1ff7ecf8f310887e9b54c9c444b8e215ccbc7b21c2f244cfae373b1ece7" +checksum = "df6f88d7ee27daf8b108ba910f9015176b36fbc72902b1ca5c2a5f1d1717e1a1" dependencies = [ "datafusion-expr", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "datafusion-optimizer" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "971c51c54cd309001376fae752fb15a6b41750b6d1552345c46afbfb6458801b" +checksum = "084d9f979c4b155346d3c34b18f4256e6904ded508e9554d90fed416415c3515" dependencies = [ "arrow", "chrono", @@ 
-1265,9 +1378,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1447c2c6bc8674a16be4786b4abf528c302803fafa186aa6275692570e64d85" +checksum = "64c536062b0076f4e30084065d805f389f9fe38af0ca75bcbac86bc5e9fbab65" dependencies = [ "ahash", "arrow", @@ -1287,9 +1400,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69f8c25dcd069073a75b3d2840a79d0f81e64bdd2c05f2d3d18939afb36a7dcb" +checksum = "f8a92b53b3193fac1916a1c5b8e3f4347c526f6822e56b71faa5fb372327a863" dependencies = [ "ahash", "arrow", @@ -1301,9 +1414,9 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68da5266b5b9847c11d1b3404ee96b1d423814e1973e1ad3789131e5ec912763" +checksum = "6fa0a5ac94c7cf3da97bedabd69d6bbca12aef84b9b37e6e9e8c25286511b5e2" dependencies = [ "arrow", "datafusion-common", @@ -1320,9 +1433,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88cc160df00e413e370b3b259c8ea7bfbebc134d32de16325950e9e923846b7f" +checksum = "690c615db468c2e5fe5085b232d8b1c088299a6c63d87fd960a354a71f7acb55" dependencies = [ "ahash", "arrow", @@ -1350,9 +1463,9 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f6ef4c6eb52370cb48639e25e2331a415aac0b2b0a0a472b36e26603bdf184f" +checksum = "a4a1afb2bdb05de7ff65be6883ebfd4ec027bd9f1f21c46aa3afd01927160a83" dependencies = [ "arrow", "chrono", @@ -1366,9 +1479,9 @@ dependencies = [ [[package]] name = 
"datafusion-proto-common" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5faf4a9bbb0d0a305fea8a6db21ba863286b53e53a212e687d2774028dd6f03f" +checksum = "35b7a5876ebd6b564fb9a1fd2c3a2a9686b787071a256b47e4708f0916f9e46f" dependencies = [ "arrow", "datafusion-common", @@ -1398,11 +1511,35 @@ dependencies = [ "uuid", ] +[[package]] +name = "datafusion-session" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad229a134c7406c057ece00c8743c0c34b97f4e72f78b475fe17b66c5e14fa4f" +dependencies = [ + "arrow", + "async-trait", + "dashmap", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "tokio", +] + [[package]] name = "datafusion-sql" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "325a212b67b677c0eb91447bf9a11b630f9fc4f62d8e5d145bf859f5a6b29e64" +checksum = "64f6ab28b72b664c21a27b22a2ff815fd390ed224c26e89a93b5a8154a4e8607" dependencies = [ "arrow", "bigdecimal", @@ -1417,9 +1554,9 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c2be3226a683e02cff65181e66e62eba9f812ed0e9b7ec8fe11ac8dabf1a73f" +checksum = "061efc0937f0ce3abb37ed0d56cfa01dd0e654b90e408656d05e846c8b7599fe" dependencies = [ "async-recursion", "async-trait", @@ -1453,7 +1590,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -1498,21 +1635,22 @@ checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" [[package]] name = "flatbuffers" -version = 
"24.12.23" +version = "25.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" +checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1" dependencies = [ - "bitflags 1.3.2", + "bitflags", "rustc_version", ] [[package]] name = "flate2" -version = "1.1.0" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc" +checksum = "7ced92e76e966ca2fd84c8f7aa01a4aea65b0eb6648d72f7c8f3e2764a67fece" dependencies = [ "crc32fast", + "libz-rs-sys", "miniz_oxide", ] @@ -1593,7 +1731,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -1703,9 +1841,9 @@ dependencies = [ [[package]] name = "half" -version = "2.4.1" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" dependencies = [ "cfg-if", "crunchy", @@ -1986,7 +2124,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2012,9 +2150,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.7.1" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" +checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" dependencies = [ "equivalent", "hashbrown 0.15.2", @@ -2207,6 +2345,15 @@ dependencies = [ "libc", ] +[[package]] +name = "libz-rs-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6489ca9bd760fe9642d7644e827b0c9add07df89857b0416ee15c1cc1a3b8c5a" +dependencies = [ + "zlib-rs", +] + [[package]] name = "linux-raw-sys" version = "0.4.15" @@ -2241,7 +2388,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" dependencies = [ - "twox-hash", + "twox-hash 1.6.3", ] [[package]] @@ -2297,9 +2444,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.8.4" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3b1c9bd4fe1f0f8b387f6eb9eb3b4a1aa26185e5750efb9140301703f62cd1b" +checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a" dependencies = [ "adler2", ] @@ -2407,19 +2554,22 @@ dependencies = [ [[package]] name = "object_store" -version = "0.11.2" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cfccb68961a56facde1163f9319e0d15743352344e7808a11795fb99698dcaf" +checksum = "e9ce831b09395f933addbc56d894d889e4b226eba304d4e7adbab591e26daf1e" dependencies = [ "async-trait", "base64 0.22.1", "bytes", "chrono", + "form_urlencoded", "futures", + "http", + "http-body-util", "httparse", "humantime", "hyper", - "itertools 0.13.0", + "itertools 0.14.0", "md-5", "parking_lot", "percent-encoding", @@ -2430,7 +2580,8 @@ dependencies = [ "rustls-pemfile", "serde", "serde_json", - "snafu", + "serde_urlencoded", + "thiserror 2.0.11", "tokio", "tracing", "url", @@ -2483,9 +2634,9 @@ dependencies = [ [[package]] name = "parquet" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94243778210509a5a5e9e012872127180c155d73a9cd6e2df9243d213e81e100" +checksum = "cd31a8290ac5b19f09ad77ee7a1e6a541f1be7674ad410547d5f1eef6eef4a9c" dependencies = [ "ahash", "arrow-array", @@ -2513,7 +2664,7 @@ dependencies = [ 
"snap", "thrift", "tokio", - "twox-hash", + "twox-hash 2.1.0", "zstd", ] @@ -2658,12 +2809,12 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.29" +version = "0.2.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6924ced06e1f7dfe3fa48d57b9f74f55d8915f5036121bef647ef4b204895fac" +checksum = "664ec5419c51e34154eec046ebcba56312d5a2fc3b09a06da188e1ad21afadf6" dependencies = [ "proc-macro2", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2692,7 +2843,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" dependencies = [ "heck", - "itertools 0.14.0", + "itertools 0.13.0", "log", "multimap", "once_cell", @@ -2701,7 +2852,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.98", + "syn 2.0.100", "tempfile", ] @@ -2712,10 +2863,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.14.0", + "itertools 0.13.0", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2747,9 +2898,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.23.4" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57fe09249128b3173d092de9523eaa75136bf7ba85e0d69eca241c7939c933cc" +checksum = "17da310086b068fbdcefbba30aeb3721d5bb9af8db4987d6735b2183ca567229" dependencies = [ "cfg-if", "indoc", @@ -2765,9 +2916,9 @@ dependencies = [ [[package]] name = "pyo3-async-runtimes" -version = "0.23.0" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "977dc837525cfd22919ba6a831413854beb7c99a256c03bf8624ad707e45810e" +checksum = "dd0b83dc42f9d41f50d38180dad65f0c99763b65a3ff2a81bf351dd35a1df8bf" dependencies = [ "futures", "once_cell", @@ -2778,9 +2929,9 @@ dependencies = [ [[package]] name = 
"pyo3-build-config" -version = "0.23.4" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd3927b5a78757a0d71aa9dff669f903b1eb64b54142a9bd9f757f8fde65fd7" +checksum = "e27165889bd793000a098bb966adc4300c312497ea25cf7a690a9f0ac5aa5fc1" dependencies = [ "once_cell", "target-lexicon", @@ -2788,9 +2939,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.23.4" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dab6bb2102bd8f991e7749f130a70d05dd557613e39ed2deeee8e9ca0c4d548d" +checksum = "05280526e1dbf6b420062f3ef228b78c0c54ba94e157f5cb724a609d0f2faabc" dependencies = [ "libc", "pyo3-build-config", @@ -2798,27 +2949,27 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.23.4" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91871864b353fd5ffcb3f91f2f703a22a9797c91b9ab497b1acac7b07ae509c7" +checksum = "5c3ce5686aa4d3f63359a5100c62a127c9f15e8398e5fdeb5deef1fed5cd5f44" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "pyo3-macros-backend" -version = "0.23.4" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43abc3b80bc20f3facd86cd3c60beed58c3e2aa26213f3cda368de39c60a27e4" +checksum = "f4cf6faa0cbfb0ed08e89beb8103ae9724eb4750e3a78084ba4017cbe94f3855" dependencies = [ "heck", "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2891,9 +3042,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.38" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] @@ -2945,7 +3096,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2954,7 +3105,7 @@ version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" dependencies = [ - "bitflags 2.8.0", + "bitflags", ] [[package]] @@ -3104,7 +3255,7 @@ version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags 2.8.0", + "bitflags", "errno", "libc", "linux-raw-sys", @@ -3198,9 +3349,9 @@ dependencies = [ [[package]] name = "schemars" -version = "0.8.21" +version = "0.8.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09c024468a378b7e36765cd36702b7a90cc3cba11654f6685c8f233408e89e92" +checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615" dependencies = [ "dyn-clone", "schemars_derive", @@ -3210,14 +3361,14 @@ dependencies = [ [[package]] name = "schemars_derive" -version = "0.8.21" +version = "0.8.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1eee588578aff73f856ab961cd2f79e36bc45d7ded33a7562adba4667aecc0e" +checksum = "32e265784ad618884abaea0600a9adf15393368d840e0222d101a072f3f7534d" dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3232,7 +3383,7 @@ version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316" dependencies = [ - "bitflags 2.8.0", + "bitflags", "core-foundation", "core-foundation-sys", "libc", @@ -3251,9 +3402,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.25" +version = "1.0.26" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" dependencies = [ "serde", ] @@ -3266,9 +3417,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.217" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" dependencies = [ "serde_derive", ] @@ -3284,13 +3435,13 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.217" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3301,14 +3452,14 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "serde_json" -version = "1.0.138" +version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" dependencies = [ "itoa", "memchr", @@ -3325,7 +3476,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3397,27 +3548,6 @@ version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" -[[package]] -name = "snafu" -version = "0.8.5" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "223891c85e2a29c3fe8fb900c1fae5e69c2e42415e3177752e8718475efa5019" -dependencies = [ - "snafu-derive", -] - -[[package]] -name = "snafu-derive" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "syn 2.0.98", -] - [[package]] name = "snap" version = "1.1.1" @@ -3436,9 +3566,9 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.54.0" +version = "0.55.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c66e3b7374ad4a6af849b08b3e7a6eda0edbd82f0fd59b57e22671bf16979899" +checksum = "c4521174166bac1ff04fe16ef4524c70144cd29682a45978978ca3d7f4e0be11" dependencies = [ "log", "recursive", @@ -3453,7 +3583,7 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3497,14 +3627,14 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "substrait" -version = "0.53.2" +version = "0.55.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fac3d70185423235f37b889764e184b81a5af4bb7c95833396ee9bd92577e1b" +checksum = "048fe52a3664881ccdfdc9bdb0f4e8805f3444ee64abf299d365c54f6a2ffabb" dependencies = [ "heck", "pbjson", @@ -3521,7 +3651,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.98", + "syn 2.0.100", "typify", "walkdir", ] @@ -3545,9 +3675,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.98" +version = "2.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" +checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" dependencies = [ "proc-macro2", "quote", 
@@ -3571,14 +3701,14 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "target-lexicon" -version = "0.12.16" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" +checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" [[package]] name = "tempfile" @@ -3620,7 +3750,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3631,7 +3761,7 @@ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3681,9 +3811,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.43.0" +version = "1.44.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" +checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48" dependencies = [ "backtrace", "bytes", @@ -3703,7 +3833,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3718,9 +3848,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.13" +version = "0.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" +checksum = "6b9590b93e6fcc1739458317cccd391ad3955e2bde8913edf6f95f9e65a8f034" dependencies = [ "bytes", "futures-core", @@ -3775,7 +3905,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ 
"proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3818,6 +3948,12 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "twox-hash" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7b17f197b3050ba473acf9181f7b1d3b66d1cf7356c6cc57886662276e65908" + [[package]] name = "typed-arena" version = "2.0.2" @@ -3841,7 +3977,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3875,7 +4011,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.98", + "syn 2.0.100", "thiserror 2.0.11", "unicode-ident", ] @@ -3893,7 +4029,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.98", + "syn 2.0.100", "typify-impl", ] @@ -4030,7 +4166,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", "wasm-bindgen-shared", ] @@ -4065,7 +4201,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4276,7 +4412,7 @@ version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" dependencies = [ - "bitflags 2.8.0", + "bitflags", ] [[package]] @@ -4320,7 +4456,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", "synstructure", ] @@ -4342,7 +4478,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -4362,7 +4498,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 
2.0.100", "synstructure", ] @@ -4391,9 +4527,15 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] +[[package]] +name = "zlib-rs" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "868b928d7949e09af2f6086dfc1e01936064cc7a819253bce650d4e2a2d63ba8" + [[package]] name = "zstd" version = "0.13.2" diff --git a/Cargo.toml b/Cargo.toml index bc8639d4c..2c4188bb0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,25 +34,25 @@ protoc = [ "datafusion-substrait/protoc" ] substrait = ["dep:datafusion-substrait"] [dependencies] -tokio = { version = "1.43", features = ["macros", "rt", "rt-multi-thread", "sync"] } -pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] } -pyo3-async-runtimes = { version = "0.23", features = ["tokio-runtime"]} -arrow = { version = "54.2.1", features = ["pyarrow"] } -datafusion = { version = "46.0.1", features = ["avro", "unicode_expressions"] } -datafusion-substrait = { version = "46.0.1", optional = true } -datafusion-proto = { version = "46.0.1" } -datafusion-ffi = { version = "46.0.1" } +tokio = { version = "1.44", features = ["macros", "rt", "rt-multi-thread", "sync"] } +pyo3 = { version = "0.24", features = ["extension-module", "abi3", "abi3-py39"] } +pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"]} +arrow = { version = "55.0.0", features = ["pyarrow"] } +datafusion = { version = "47.0.0", features = ["avro", "unicode_expressions"] } +datafusion-substrait = { version = "47.0.0", optional = true } +datafusion-proto = { version = "47.0.0" } +datafusion-ffi = { version = "47.0.0" } prost = "0.13.1" # keep in line with `datafusion-substrait` -uuid = { version = "1.12", features = ["v4"] } +uuid = { version = "1.16", features = ["v4"] } mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] } 
-async-trait = "0.1.73" +async-trait = "0.1.88" futures = "0.3" -object_store = { version = "0.11.0", features = ["aws", "gcp", "azure", "http"] } +object_store = { version = "0.12.0", features = ["aws", "gcp", "azure", "http"] } url = "2" [build-dependencies] prost-types = "0.13.1" # keep in line with `datafusion-substrait` -pyo3-build-config = "0.23" +pyo3-build-config = "0.24" [lib] name = "datafusion_python" diff --git a/python/tests/test_aggregation.py b/python/tests/test_aggregation.py index 61b1c7d80..49dfb38cf 100644 --- a/python/tests/test_aggregation.py +++ b/python/tests/test_aggregation.py @@ -338,7 +338,6 @@ def test_bit_and_bool_fns(df, name, expr, result): ), [7, 9], ), - ("last_value", f.last_value(column("a")), [3, 6]), ( "last_value_ordered", f.last_value(column("a"), order_by=[column("a").sort(ascending=False)]), diff --git a/src/dataframe.rs b/src/dataframe.rs index 9b610b5d7..787f63520 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -216,7 +216,7 @@ impl PyDataFrame { #[pyo3(signature = (*args))] fn select(&self, args: Vec) -> PyDataFusionResult { - let expr = args.into_iter().map(|e| e.into()).collect(); + let expr: Vec = args.into_iter().map(|e| e.into()).collect(); let df = self.df.as_ref().clone().select(expr)?; Ok(Self::new(df)) } diff --git a/src/dataset_exec.rs b/src/dataset_exec.rs index 445e4fe74..aab8d7566 100644 --- a/src/dataset_exec.rs +++ b/src/dataset_exec.rs @@ -275,7 +275,9 @@ impl DisplayAs for DatasetExec { Python::with_gil(|py| { let number_of_fragments = self.fragments.bind(py).len(); match t { - DisplayFormatType::Default | DisplayFormatType::Verbose => { + DisplayFormatType::Default + | DisplayFormatType::Verbose + | DisplayFormatType::TreeRender => { let projected_columns: Vec = self .schema .fields() diff --git a/src/expr.rs b/src/expr.rs index 561170289..fe0e76daa 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -714,9 +714,19 @@ impl PyExpr { | Operator::BitwiseXor | Operator::BitwiseAnd | Operator::BitwiseOr => 
DataTypeMap::map_from_arrow_type(&DataType::Binary), - Operator::AtArrow | Operator::ArrowAt => { - Err(py_type_err(format!("Unsupported expr: ${op}"))) - } + Operator::AtArrow + | Operator::ArrowAt + | Operator::Arrow + | Operator::LongArrow + | Operator::HashArrow + | Operator::HashLongArrow + | Operator::AtAt + | Operator::IntegerDivide + | Operator::HashMinus + | Operator::AtQuestion + | Operator::Question + | Operator::QuestionAnd + | Operator::QuestionPipe => Err(py_type_err(format!("Unsupported expr: ${op}"))), }, Expr::Cast(Cast { expr: _, data_type }) => DataTypeMap::map_from_arrow_type(data_type), Expr::Literal(scalar_value) => DataTypeMap::map_from_scalar_value(scalar_value), diff --git a/src/functions.rs b/src/functions.rs index 9c406b95a..476c2b80e 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -375,27 +375,6 @@ macro_rules! aggregate_function { }; } -macro_rules! aggregate_function_vec_args { - ($NAME: ident) => { - aggregate_function_vec_args!($NAME, expr); - }; - ($NAME: ident, $($arg:ident)*) => { - #[pyfunction] - #[pyo3(signature = ($($arg),*, distinct=None, filter=None, order_by=None, null_treatment=None))] - fn $NAME( - $($arg: PyExpr),*, - distinct: Option, - filter: Option, - order_by: Option>, - null_treatment: Option - ) -> PyDataFusionResult { - let agg_fn = functions_aggregate::expr_fn::$NAME(vec![$($arg.into()),*]); - - add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) - } - }; -} - /// Generates a [pyo3] wrapper for [datafusion::functions::expr_fn] /// /// These functions have explicit named arguments. 
@@ -698,8 +677,22 @@ pub fn approx_percentile_cont_with_weight( add_builder_fns_to_aggregate(agg_fn, None, filter, None, None) } -aggregate_function_vec_args!(last_value); +// We handle last_value explicitly because the signature expects an order_by +// https://github.com/apache/datafusion/issues/12376 +#[pyfunction] +#[pyo3(signature = (expr, distinct=None, filter=None, order_by=None, null_treatment=None))] +pub fn last_value( + expr: PyExpr, + distinct: Option, + filter: Option, + order_by: Option>, + null_treatment: Option, +) -> PyDataFusionResult { + // If we initialize the UDAF with order_by directly, then it gets over-written by the builder + let agg_fn = functions_aggregate::expr_fn::last_value(expr.expr, None); + add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) +} // We handle first_value explicitly because the signature expects an order_by // https://github.com/apache/datafusion/issues/12376 #[pyfunction] From c9f15547cb8019068bbf2dc8eaf148d6eb42bd48 Mon Sep 17 00:00:00 2001 From: Chen Chongchen Date: Fri, 25 Apr 2025 21:01:54 +0800 Subject: [PATCH 123/248] feat: alias with metadata (#1111) * feat: alias with metadata * fmt --- python/datafusion/expr.py | 14 +++++++++++--- python/datafusion/functions.py | 15 ++++++++++++--- python/tests/test_expr.py | 5 +++++ python/tests/test_functions.py | 5 +++++ src/expr.rs | 6 ++++-- src/functions.rs | 9 +++++++-- 6 files changed, 44 insertions(+), 10 deletions(-) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 2697d8143..01e1f3ded 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -406,9 +406,17 @@ def column(value: str) -> Expr: """Creates a new expression representing a column.""" return Expr(expr_internal.RawExpr.column(value)) - def alias(self, name: str) -> Expr: - """Assign a name to the expression.""" - return Expr(self.expr.alias(name)) + def alias(self, name: str, metadata: Optional[dict[str, str]] = None) -> Expr: + """Assign
a name to the expression. + + Args: + name: The name to assign to the expression. + metadata: Optional metadata to attach to the expression. + + Returns: + A new expression with the assigned name. + """ + return Expr(self.expr.alias(name, metadata)) def sort(self, ascending: bool = True, nulls_first: bool = True) -> SortExpr: """Creates a sort :py:class:`Expr` from an existing :py:class:`Expr`. diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 5cf914e16..f430cdf4b 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -372,9 +372,18 @@ def order_by(expr: Expr, ascending: bool = True, nulls_first: bool = True) -> So return SortExpr(expr, ascending=ascending, nulls_first=nulls_first) -def alias(expr: Expr, name: str) -> Expr: - """Creates an alias expression.""" - return Expr(f.alias(expr.expr, name)) +def alias(expr: Expr, name: str, metadata: Optional[dict[str, str]] = None) -> Expr: + """Creates an alias expression with an optional metadata dictionary. 
+ + Args: + expr: The expression to alias + name: The alias name + metadata: Optional metadata to attach to the column + + Returns: + An expression with the given alias + """ + return Expr(f.alias(expr.expr, name, metadata)) def col(name: str) -> Expr: diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py index 926e69845..dcf75f021 100644 --- a/python/tests/test_expr.py +++ b/python/tests/test_expr.py @@ -247,3 +247,8 @@ def test_fill_null(df): assert result.column(0) == pa.array([1, 2, 100]) assert result.column(1) == pa.array([4, 25, 6]) assert result.column(2) == pa.array([1234, 1234, 8]) + + +def test_alias_with_metadata(df): + df = df.select(col("a").alias("b", {"key": "value"})) + assert df.schema().field("b").metadata == {b"key": b"value"} diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 37f2075f5..90cf01f7e 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -1231,3 +1231,8 @@ def test_between_default(df): actual = df.collect()[0].to_pydict() assert actual == expected + + +def test_alias_with_metadata(df): + df = df.select(f.alias(f.col("a"), "b", {"key": "value"})) + assert df.schema().field("b").metadata == {b"key": b"value"} diff --git a/src/expr.rs b/src/expr.rs index fe0e76daa..7d4aa8798 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -22,6 +22,7 @@ use datafusion::logical_expr::{ }; use pyo3::IntoPyObjectExt; use pyo3::{basic::CompareOp, prelude::*}; +use std::collections::HashMap; use std::convert::{From, Into}; use std::sync::Arc; use window::PyWindowFrame; @@ -275,8 +276,9 @@ impl PyExpr { } /// assign a name to the PyExpr - pub fn alias(&self, name: &str) -> PyExpr { - self.expr.clone().alias(name).into() + #[pyo3(signature = (name, metadata=None))] + pub fn alias(&self, name: &str, metadata: Option>) -> PyExpr { + self.expr.clone().alias_with_metadata(name, metadata).into() } /// Create a sort PyExpr from an existing PyExpr. 
diff --git a/src/functions.rs b/src/functions.rs index 476c2b80e..caa79b8ad 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +use std::collections::HashMap; + use datafusion::functions_aggregate::all_default_aggregate_functions; use datafusion::functions_window::all_default_window_functions; use datafusion::logical_expr::expr::WindowFunctionParams; @@ -205,10 +207,13 @@ fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) -> PyResult /// Creates a new Alias Expr #[pyfunction] -fn alias(expr: PyExpr, name: &str) -> PyResult { +#[pyo3(signature = (expr, name, metadata=None))] +fn alias(expr: PyExpr, name: &str, metadata: Option>) -> PyResult { let relation: Option = None; Ok(PyExpr { - expr: datafusion::logical_expr::Expr::Alias(Alias::new(expr.expr, relation, name)), + expr: datafusion::logical_expr::Expr::Alias( + Alias::new(expr.expr, relation, name).with_metadata(metadata), + ), }) } From 91b66351fb19d91b62e8db83444141743b106e43 Mon Sep 17 00:00:00 2001 From: kosiew Date: Sun, 27 Apr 2025 21:41:01 +0800 Subject: [PATCH 124/248] Add DataFrame usage guide with HTML rendering customization options (#1108) * docs: enhance user guide with detailed DataFrame operations and examples * move /docs/source/api/dataframe.rst into user-guide * docs: remove DataFrame API documentation * docs: fix formatting inconsistencies in DataFrame user guide * Two minor corrections to documentation rendering --------- Co-authored-by: Tim Saucer --- docs/source/index.rst | 1 + docs/source/user-guide/basics.rst | 5 +- docs/source/user-guide/dataframe.rst | 179 +++++++++++++++++++++++++++ 3 files changed, 184 insertions(+), 1 deletion(-) create mode 100644 docs/source/user-guide/dataframe.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 558b2d572..c18793822 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -72,6 +72,7 @@ Example user-guide/introduction 
user-guide/basics user-guide/data-sources + user-guide/dataframe user-guide/common-operations/index user-guide/io/index user-guide/configuration diff --git a/docs/source/user-guide/basics.rst b/docs/source/user-guide/basics.rst index 6636c0c6a..2975d9a6b 100644 --- a/docs/source/user-guide/basics.rst +++ b/docs/source/user-guide/basics.rst @@ -21,7 +21,8 @@ Concepts ======== In this section, we will cover a basic example to introduce a few key concepts. We will use the -2021 Yellow Taxi Trip Records ([download](https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet)), from the [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page). +2021 Yellow Taxi Trip Records (`download <https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet>`_), +from the `TLC Trip Record Data <https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page>`_. .. ipython:: python @@ -72,6 +73,8 @@ DataFrames are typically created by calling a method on :py:class:`~datafusion.c calling the transformation methods, such as :py:func:`~datafusion.dataframe.DataFrame.filter`, :py:func:`~datafusion.dataframe.DataFrame.select`, :py:func:`~datafusion.dataframe.DataFrame.aggregate`, and :py:func:`~datafusion.dataframe.DataFrame.limit` to build up a query definition. +For more details on working with DataFrames, including visualization options and conversion to other formats, see :doc:`dataframe`. + Expressions ----------- diff --git a/docs/source/user-guide/dataframe.rst b/docs/source/user-guide/dataframe.rst new file mode 100644 index 000000000..a78fd8073 --- /dev/null +++ b/docs/source/user-guide/dataframe.rst @@ -0,0 +1,179 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +..
http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +DataFrames +========== + +Overview +-------- + +DataFusion's DataFrame API provides a powerful interface for building and executing queries against data sources. +It offers a familiar API similar to pandas and other DataFrame libraries, but with the performance benefits of Rust +and Arrow. + +A DataFrame represents a logical plan that can be composed through operations like filtering, projection, and aggregation. +The actual execution happens when terminal operations like ``collect()`` or ``show()`` are called. + +Basic Usage +----------- + +.. code-block:: python + + import datafusion + from datafusion import col, lit + + # Create a context and register a data source + ctx = datafusion.SessionContext() + ctx.register_csv("my_table", "path/to/data.csv") + + # Create and manipulate a DataFrame + df = ctx.sql("SELECT * FROM my_table") + + # Or use the DataFrame API directly + df = (ctx.table("my_table") + .filter(col("age") > lit(25)) + .select([col("name"), col("age")])) + + # Execute and collect results + result = df.collect() + + # Display the first few rows + df.show() + +HTML Rendering +-------------- + +When working in Jupyter notebooks or other environments that support HTML rendering, DataFrames will +automatically display as formatted HTML tables, making it easier to visualize your data. + +The ``_repr_html_`` method is called automatically by Jupyter to render a DataFrame. This method +controls how DataFrames appear in notebook environments, providing a richer visualization than +plain text output. 
+ +Customizing HTML Rendering +-------------------------- + +You can customize how DataFrames are rendered in HTML by configuring the formatter: + +.. code-block:: python + + from datafusion.html_formatter import configure_formatter + + # Change the default styling + configure_formatter( + max_rows=50, # Maximum number of rows to display + max_width=None, # Maximum width in pixels (None for auto) + theme="light", # Theme: "light" or "dark" + precision=2, # Floating point precision + thousands_separator=",", # Separator for thousands + date_format="%Y-%m-%d", # Date format + truncate_width=20 # Max width for string columns before truncating + ) + +The formatter settings affect all DataFrames displayed after configuration. + +Custom Style Providers +---------------------- + +For advanced styling needs, you can create a custom style provider: + +.. code-block:: python + + from datafusion.html_formatter import StyleProvider, configure_formatter + + class MyStyleProvider(StyleProvider): + def get_table_styles(self): + return { + "table": "border-collapse: collapse; width: 100%;", + "th": "background-color: #007bff; color: white; padding: 8px; text-align: left;", + "td": "border: 1px solid #ddd; padding: 8px;", + "tr:nth-child(even)": "background-color: #f2f2f2;", + } + + def get_value_styles(self, dtype, value): + """Return custom styles for specific values""" + if dtype == "float" and value < 0: + return "color: red;" + return None + + # Apply the custom style provider + configure_formatter(style_provider=MyStyleProvider()) + +Creating a Custom Formatter +--------------------------- + +For complete control over rendering, you can implement a custom formatter: + +.. code-block:: python + + from datafusion.html_formatter import Formatter, configure_formatter, get_formatter + + class MyFormatter(Formatter): + def format_html(self, batches, schema, has_more=False, table_uuid=None): + # Create your custom HTML here + html = "<div class='my-custom-table'>"
+ # ... formatting logic ... + html += "</div>"
+ return html + + # Set as the global formatter + configure_formatter(formatter_class=MyFormatter) + + # Or use the formatter just for specific operations + formatter = get_formatter() + custom_html = formatter.format_html(batches, schema) + +Managing Formatters +------------------- + +Reset to default formatting: + +.. code-block:: python + + from datafusion.html_formatter import reset_formatter + + # Reset to default settings + reset_formatter() + +Get the current formatter settings: + +.. code-block:: python + + from datafusion.html_formatter import get_formatter + + formatter = get_formatter() + print(formatter.max_rows) + print(formatter.theme) + +Contextual Formatting +--------------------- + +You can also use a context manager to temporarily change formatting settings: + +.. code-block:: python + + from datafusion.html_formatter import formatting_context + + # Default formatting + df.show() + + # Temporarily use different formatting + with formatting_context(max_rows=100, theme="dark"): + df.show() # Will use the temporary settings + + # Back to default formatting + df.show() From 00dea113eb85d54b758eb3451ea448c7b9263c1c Mon Sep 17 00:00:00 2001 From: deanm0000 <37878412+deanm0000@users.noreply.github.com> Date: Sun, 27 Apr 2025 10:14:54 -0400 Subject: [PATCH 125/248] Improve col class access using __getattr__ Co-authored-by: Tim Saucer --- python/datafusion/__init__.py | 12 ++-------- python/datafusion/col.py | 45 +++++++++++++++++++++++++++++++++++ python/tests/test_expr.py | 23 ++++++++++++++++++ 3 files changed, 70 insertions(+), 10 deletions(-) create mode 100644 python/datafusion/col.py diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 60d0d61b4..15ceefbdb 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -26,6 +26,8 @@ except ImportError: import importlib_metadata +from datafusion.col import col, column + from .
import functions, object_store, substrait, unparser # The following imports are okay to remain as opaque to the user. @@ -95,16 +97,6 @@ ] -def column(value: str) -> Expr: - """Create a column expression.""" - return Expr.column(value) - - -def col(value: str) -> Expr: - """Create a column expression.""" - return Expr.column(value) - - def literal(value) -> Expr: """Create a literal expression.""" return Expr.literal(value) diff --git a/python/datafusion/col.py b/python/datafusion/col.py new file mode 100644 index 000000000..1141dc092 --- /dev/null +++ b/python/datafusion/col.py @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Col class.""" + +from datafusion.expr import Expr + + +class Col: + """Create a column expression. + + This helper class allows an extra syntax of creating columns using the __getattr__ + method. 
+ """ + + def __call__(self, value: str) -> Expr: + """Create a column expression.""" + return Expr.column(value) + + def __getattr__(self, value: str) -> Expr: + """Create a column using attribute syntax.""" + # For autocomplete to work with IPython + if value.startswith("__wrapped__"): + return getattr(type(self), value) + + return Expr.column(value) + + +col: Col = Col() +column: Col = Col() +__all__ = ["col", "column"] diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py index dcf75f021..3651b60d6 100644 --- a/python/tests/test_expr.py +++ b/python/tests/test_expr.py @@ -249,6 +249,29 @@ def test_fill_null(df): assert result.column(2) == pa.array([1234, 1234, 8]) +def test_col_getattr(): + ctx = SessionContext() + data = { + "array_values": [[1, 2, 3], [4, 5], [6], []], + "struct_values": [ + {"name": "Alice", "age": 15}, + {"name": "Bob", "age": 14}, + {"name": "Charlie", "age": 13}, + {"name": None, "age": 12}, + ], + } + df = ctx.from_pydict(data, name="table1") + + names = df.select(col.struct_values["name"].alias("name")).collect() + names = [r.as_py() for rs in names for r in rs["name"]] + + array_values = df.select(col.array_values[1].alias("value")).collect() + array_values = [r.as_py() for rs in array_values for r in rs["value"]] + + assert names == ["Alice", "Bob", "Charlie", None] + assert array_values == [2, 5, None, None] + + def test_alias_with_metadata(df): df = df.select(col("a").alias("b", {"key": "value"})) assert df.schema().field("b").metadata == {b"key": b"value"} From 5a7f638286d2397bbce87e0e8197bebb46f26649 Mon Sep 17 00:00:00 2001 From: deanm0000 <37878412+deanm0000@users.noreply.github.com> Date: Sun, 27 Apr 2025 10:17:41 -0400 Subject: [PATCH 126/248] Add expression chaining of single parameter scalar functions --- python/datafusion/expr.py | 289 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 289 insertions(+) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 01e1f3ded..84e9d4ebb 100644 --- 
a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -24,6 +24,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Optional +import functions as F import pyarrow as pa try: @@ -611,6 +612,294 @@ def over(self, window: Window) -> Expr: ) ) + def asin(self) -> Expr: + """Returns the arc sine or inverse sine of a number.""" + return F.asin(self) + + def array_pop_back(self) -> Expr: + """Returns the array without the last element.""" + return F.array_pop_back(self) + + def reverse(self) -> Expr: + """Reverse the string argument.""" + return F.reverse(self) + + def bit_length(self) -> Expr: + """Returns the number of bits in the string argument.""" + return F.bit_length(self) + + def array_length(self) -> Expr: + """Returns the length of the array.""" + return F.array_length(self) + + def array_ndims(self) -> Expr: + """Returns the number of dimensions of the array.""" + return F.array_ndims(self) + + def to_hex(self) -> Expr: + """Converts an integer to a hexadecimal string.""" + return F.to_hex(self) + + def array_dims(self) -> Expr: + """Returns an array of the array's dimensions.""" + return F.array_dims(self) + + def from_unixtime(self) -> Expr: + """Converts an integer to RFC3339 timestamp format string.""" + return F.from_unixtime(self) + + def array_empty(self) -> Expr: + """Returns a boolean indicating whether the array is empty.""" + return F.array_empty(self) + + def sin(self) -> Expr: + """Returns the sine of the argument.""" + return F.sin(self) + + def log10(self) -> Expr: + """Base 10 logarithm of the argument.""" + return F.log10(self) + + def initcap(self) -> Expr: + """Set the initial letter of each word to capital. + + Converts the first letter of each word in ``string`` to uppercase and the remaining + characters to lowercase. + """ + return F.initcap(self) + + def list_distinct(self) -> Expr: + """Returns distinct values from the array after removing duplicates. + + This is an alias for :py:func:`array_distinct`. 
+ """ + return F.list_distinct(self) + + def iszero(self) -> Expr: + """Returns true if a given number is +0.0 or -0.0 otherwise returns false.""" + return F.iszero(self) + + def array_distinct(self) -> Expr: + """Returns distinct values from the array after removing duplicates.""" + return F.array_distinct(self) + + def arrow_typeof(self) -> Expr: + """Returns the Arrow type of the expression.""" + return F.arrow_typeof(self) + + def length(self) -> Expr: + """The number of characters in the ``string``.""" + return F.length(self) + + def lower(self) -> Expr: + """Converts a string to lowercase.""" + return F.lower(self) + + def acos(self) -> Expr: + """Returns the arc cosine or inverse cosine of a number. + + Returns: + -------- + Expr + A new expression representing the arc cosine of the input expression. + """ + return F.acos(self) + + def ascii(self) -> Expr: + """Returns the numeric code of the first character of the argument.""" + return F.ascii(self) + + def sha384(self) -> Expr: + """Computes the SHA-384 hash of a binary string.""" + return F.sha384(self) + + def isnan(self) -> Expr: + """Returns true if a given number is +NaN or -NaN otherwise returns false.""" + return F.isnan(self) + + def degrees(self) -> Expr: + """Converts the argument from radians to degrees.""" + return F.degrees(self) + + def cardinality(self) -> Expr: + """Returns the total number of elements in the array.""" + return F.cardinality(self) + + def sha224(self) -> Expr: + """Computes the SHA-224 hash of a binary string.""" + return F.sha224(self) + + def asinh(self) -> Expr: + """Returns inverse hyperbolic sine.""" + return F.asinh(self) + + def flatten(self) -> Expr: + """Flattens an array of arrays into a single array.""" + return F.flatten(self) + + def exp(self) -> Expr: + """Returns the exponential of the argument.""" + return F.exp(self) + + def abs(self) -> Expr: + """Return the absolute value of a given number. 
+ + Returns: + -------- + Expr + A new expression representing the absolute value of the input expression. + """ + return F.abs(self) + + def btrim(self) -> Expr: + """Removes all characters, spaces by default, from both sides of a string.""" + return F.btrim(self) + + def md5(self) -> Expr: + """Computes an MD5 128-bit checksum for a string expression.""" + return F.md5(self) + + def octet_length(self) -> Expr: + """Returns the number of bytes of a string.""" + return F.octet_length(self) + + def cosh(self) -> Expr: + """Returns the hyperbolic cosine of the argument.""" + return F.cosh(self) + + def radians(self) -> Expr: + """Converts the argument from degrees to radians.""" + return F.radians(self) + + def sqrt(self) -> Expr: + """Returns the square root of the argument.""" + return F.sqrt(self) + + def character_length(self) -> Expr: + """Returns the number of characters in the argument.""" + return F.character_length(self) + + def tanh(self) -> Expr: + """Returns the hyperbolic tangent of the argument.""" + return F.tanh(self) + + def atan(self) -> Expr: + """Returns inverse tangent of a number.""" + return F.atan(self) + + def rtrim(self) -> Expr: + """Removes all characters, spaces by default, from the end of a string.""" + return F.rtrim(self) + + def atanh(self) -> Expr: + """Returns inverse hyperbolic tangent.""" + return F.atanh(self) + + def list_dims(self) -> Expr: + """Returns an array of the array's dimensions. + + This is an alias for :py:func:`array_dims`. 
+ """ + return F.list_dims(self) + + def sha256(self) -> Expr: + """Computes the SHA-256 hash of a binary string.""" + return F.sha256(self) + + def factorial(self) -> Expr: + """Returns the factorial of the argument.""" + return F.factorial(self) + + def acosh(self) -> Expr: + """Returns inverse hyperbolic cosine.""" + return F.acosh(self) + + def floor(self) -> Expr: + """Returns the nearest integer less than or equal to the argument.""" + return F.floor(self) + + def ceil(self) -> Expr: + """Returns the nearest integer greater than or equal to argument.""" + return F.ceil(self) + + def list_length(self) -> Expr: + """Returns the length of the array. + + This is an alias for :py:func:`array_length`. + """ + return F.list_length(self) + + def upper(self) -> Expr: + """Converts a string to uppercase.""" + return F.upper(self) + + def chr(self) -> Expr: + """Converts the Unicode code point to a UTF8 character.""" + return F.chr(self) + + def ln(self) -> Expr: + """Returns the natural logarithm (base e) of the argument.""" + return F.ln(self) + + def tan(self) -> Expr: + """Returns the tangent of the argument.""" + return F.tan(self) + + def array_pop_front(self) -> Expr: + """Returns the array without the first element.""" + return F.array_pop_front(self) + + def cbrt(self) -> Expr: + """Returns the cube root of a number.""" + return F.cbrt(self) + + def sha512(self) -> Expr: + """Computes the SHA-512 hash of a binary string.""" + return F.sha512(self) + + def char_length(self) -> Expr: + """The number of characters in the ``string``.""" + return F.char_length(self) + + def list_ndims(self) -> Expr: + """Returns the number of dimensions of the array. + + This is an alias for :py:func:`array_ndims`. 
+ """ + return F.list_ndims(self) + + def trim(self) -> Expr: + """Removes all characters, spaces by default, from both sides of a string.""" + return F.trim(self) + + def cos(self) -> Expr: + """Returns the cosine of the argument.""" + return F.cos(self) + + def sinh(self) -> Expr: + """Returns the hyperbolic sine of the argument.""" + return F.sinh(self) + + def empty(self) -> Expr: + """This is an alias for :py:func:`array_empty`.""" + return F.empty(self) + + def ltrim(self) -> Expr: + """Removes all characters, spaces by default, from the beginning of a string.""" + return F.ltrim(self) + + def signum(self) -> Expr: + """Returns the sign of the argument (-1, 0, +1).""" + return F.signum(self) + + def log2(self) -> Expr: + """Base 2 logarithm of the argument.""" + return F.log2(self) + + def cot(self) -> Expr: + """Returns the cotangent of the argument.""" + return F.cot(self) + class ExprFuncBuilder: def __init__(self, builder: expr_internal.ExprFuncBuilder) -> None: From 10600fb8fc32eba43b0b0f198325b55c63f8223d Mon Sep 17 00:00:00 2001 From: Chen Chongchen Date: Mon, 28 Apr 2025 21:25:59 +0800 Subject: [PATCH 127/248] fix: recursive import (#1117) * fix: recursive import * format * format --- python/datafusion/expr.py | 135 +++++++++++++++++++++++++++++++++++++- 1 file changed, 132 insertions(+), 3 deletions(-) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 84e9d4ebb..3750eeb3f 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -24,7 +24,6 @@ from typing import TYPE_CHECKING, Any, ClassVar, Optional -import functions as F import pyarrow as pa try: @@ -614,58 +613,84 @@ def over(self, window: Window) -> Expr: def asin(self) -> Expr: """Returns the arc sine or inverse sine of a number.""" + from . import functions as F + return F.asin(self) def array_pop_back(self) -> Expr: """Returns the array without the last element.""" + from . 
import functions as F + return F.array_pop_back(self) def reverse(self) -> Expr: """Reverse the string argument.""" + from . import functions as F + return F.reverse(self) def bit_length(self) -> Expr: """Returns the number of bits in the string argument.""" + from . import functions as F + return F.bit_length(self) def array_length(self) -> Expr: """Returns the length of the array.""" + from . import functions as F + return F.array_length(self) def array_ndims(self) -> Expr: """Returns the number of dimensions of the array.""" + from . import functions as F + return F.array_ndims(self) def to_hex(self) -> Expr: """Converts an integer to a hexadecimal string.""" + from . import functions as F + return F.to_hex(self) def array_dims(self) -> Expr: """Returns an array of the array's dimensions.""" + from . import functions as F + return F.array_dims(self) def from_unixtime(self) -> Expr: """Converts an integer to RFC3339 timestamp format string.""" + from . import functions as F + return F.from_unixtime(self) def array_empty(self) -> Expr: """Returns a boolean indicating whether the array is empty.""" + from . import functions as F + return F.array_empty(self) def sin(self) -> Expr: """Returns the sine of the argument.""" + from . import functions as F + return F.sin(self) def log10(self) -> Expr: """Base 10 logarithm of the argument.""" + from . import functions as F + return F.log10(self) def initcap(self) -> Expr: """Set the initial letter of each word to capital. - Converts the first letter of each word in ``string`` to uppercase and the remaining - characters to lowercase. + Converts the first letter of each word in ``string`` + to uppercase and the remaining characters to lowercase. """ + from . import functions as F + return F.initcap(self) def list_distinct(self) -> Expr: @@ -673,26 +698,38 @@ def list_distinct(self) -> Expr: This is an alias for :py:func:`array_distinct`. """ + from . 
import functions as F + return F.list_distinct(self) def iszero(self) -> Expr: """Returns true if a given number is +0.0 or -0.0 otherwise returns false.""" + from . import functions as F + return F.iszero(self) def array_distinct(self) -> Expr: """Returns distinct values from the array after removing duplicates.""" + from . import functions as F + return F.array_distinct(self) def arrow_typeof(self) -> Expr: """Returns the Arrow type of the expression.""" + from . import functions as F + return F.arrow_typeof(self) def length(self) -> Expr: """The number of characters in the ``string``.""" + from . import functions as F + return F.length(self) def lower(self) -> Expr: """Converts a string to lowercase.""" + from . import functions as F + return F.lower(self) def acos(self) -> Expr: @@ -703,42 +740,62 @@ def acos(self) -> Expr: Expr A new expression representing the arc cosine of the input expression. """ + from . import functions as F + return F.acos(self) def ascii(self) -> Expr: """Returns the numeric code of the first character of the argument.""" + from . import functions as F + return F.ascii(self) def sha384(self) -> Expr: """Computes the SHA-384 hash of a binary string.""" + from . import functions as F + return F.sha384(self) def isnan(self) -> Expr: """Returns true if a given number is +NaN or -NaN otherwise returns false.""" + from . import functions as F + return F.isnan(self) def degrees(self) -> Expr: """Converts the argument from radians to degrees.""" + from . import functions as F + return F.degrees(self) def cardinality(self) -> Expr: """Returns the total number of elements in the array.""" + from . import functions as F + return F.cardinality(self) def sha224(self) -> Expr: """Computes the SHA-224 hash of a binary string.""" + from . import functions as F + return F.sha224(self) def asinh(self) -> Expr: """Returns inverse hyperbolic sine.""" + from . 
import functions as F + return F.asinh(self) def flatten(self) -> Expr: """Flattens an array of arrays into a single array.""" + from . import functions as F + return F.flatten(self) def exp(self) -> Expr: """Returns the exponential of the argument.""" + from . import functions as F + return F.exp(self) def abs(self) -> Expr: @@ -749,50 +806,74 @@ def abs(self) -> Expr: Expr A new expression representing the absolute value of the input expression. """ + from . import functions as F + return F.abs(self) def btrim(self) -> Expr: """Removes all characters, spaces by default, from both sides of a string.""" + from . import functions as F + return F.btrim(self) def md5(self) -> Expr: """Computes an MD5 128-bit checksum for a string expression.""" + from . import functions as F + return F.md5(self) def octet_length(self) -> Expr: """Returns the number of bytes of a string.""" + from . import functions as F + return F.octet_length(self) def cosh(self) -> Expr: """Returns the hyperbolic cosine of the argument.""" + from . import functions as F + return F.cosh(self) def radians(self) -> Expr: """Converts the argument from degrees to radians.""" + from . import functions as F + return F.radians(self) def sqrt(self) -> Expr: """Returns the square root of the argument.""" + from . import functions as F + return F.sqrt(self) def character_length(self) -> Expr: """Returns the number of characters in the argument.""" + from . import functions as F + return F.character_length(self) def tanh(self) -> Expr: """Returns the hyperbolic tangent of the argument.""" + from . import functions as F + return F.tanh(self) def atan(self) -> Expr: """Returns inverse tangent of a number.""" + from . import functions as F + return F.atan(self) def rtrim(self) -> Expr: """Removes all characters, spaces by default, from the end of a string.""" + from . import functions as F + return F.rtrim(self) def atanh(self) -> Expr: """Returns inverse hyperbolic tangent.""" + from . 
import functions as F + return F.atanh(self) def list_dims(self) -> Expr: @@ -800,26 +881,38 @@ def list_dims(self) -> Expr: This is an alias for :py:func:`array_dims`. """ + from . import functions as F + return F.list_dims(self) def sha256(self) -> Expr: """Computes the SHA-256 hash of a binary string.""" + from . import functions as F + return F.sha256(self) def factorial(self) -> Expr: """Returns the factorial of the argument.""" + from . import functions as F + return F.factorial(self) def acosh(self) -> Expr: """Returns inverse hyperbolic cosine.""" + from . import functions as F + return F.acosh(self) def floor(self) -> Expr: """Returns the nearest integer less than or equal to the argument.""" + from . import functions as F + return F.floor(self) def ceil(self) -> Expr: """Returns the nearest integer greater than or equal to argument.""" + from . import functions as F + return F.ceil(self) def list_length(self) -> Expr: @@ -827,38 +920,56 @@ def list_length(self) -> Expr: This is an alias for :py:func:`array_length`. """ + from . import functions as F + return F.list_length(self) def upper(self) -> Expr: """Converts a string to uppercase.""" + from . import functions as F + return F.upper(self) def chr(self) -> Expr: """Converts the Unicode code point to a UTF8 character.""" + from . import functions as F + return F.chr(self) def ln(self) -> Expr: """Returns the natural logarithm (base e) of the argument.""" + from . import functions as F + return F.ln(self) def tan(self) -> Expr: """Returns the tangent of the argument.""" + from . import functions as F + return F.tan(self) def array_pop_front(self) -> Expr: """Returns the array without the first element.""" + from . import functions as F + return F.array_pop_front(self) def cbrt(self) -> Expr: """Returns the cube root of a number.""" + from . import functions as F + return F.cbrt(self) def sha512(self) -> Expr: """Computes the SHA-512 hash of a binary string.""" + from . 
import functions as F + return F.sha512(self) def char_length(self) -> Expr: """The number of characters in the ``string``.""" + from . import functions as F + return F.char_length(self) def list_ndims(self) -> Expr: @@ -866,38 +977,56 @@ def list_ndims(self) -> Expr: This is an alias for :py:func:`array_ndims`. """ + from . import functions as F + return F.list_ndims(self) def trim(self) -> Expr: """Removes all characters, spaces by default, from both sides of a string.""" + from . import functions as F + return F.trim(self) def cos(self) -> Expr: """Returns the cosine of the argument.""" + from . import functions as F + return F.cos(self) def sinh(self) -> Expr: """Returns the hyperbolic sine of the argument.""" + from . import functions as F + return F.sinh(self) def empty(self) -> Expr: """This is an alias for :py:func:`array_empty`.""" + from . import functions as F + return F.empty(self) def ltrim(self) -> Expr: """Removes all characters, spaces by default, from the beginning of a string.""" + from . import functions as F + return F.ltrim(self) def signum(self) -> Expr: """Returns the sign of the argument (-1, 0, +1).""" + from . import functions as F + return F.signum(self) def log2(self) -> Expr: """Base 2 logarithm of the argument.""" + from . import functions as F + return F.log2(self) def cot(self) -> Expr: """Returns the cotangent of the argument.""" + from . 
import functions as F + return F.cot(self) From 6fbeceff6091aee610273d9b27106483f9ce24ea Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 1 May 2025 12:10:40 -0400 Subject: [PATCH 128/248] Copy over protected branch rule from datafusion repo (#1122) --- .asf.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.asf.yaml b/.asf.yaml index e96b43cf0..75b2262de 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -29,6 +29,10 @@ github: rebase: false features: issues: true + protected_branches: + main: + required_pull_request_reviews: + required_approving_review_count: 1 staging: whoami: asf-staging From 15b96c48eb76ad8ea19022df427aa25b06c3012b Mon Sep 17 00:00:00 2001 From: Chen Chongchen Date: Mon, 5 May 2025 21:43:03 +0800 Subject: [PATCH 129/248] feat: add missing PyLogicalPlan to_variant (#1085) * add expr * format * clippy * add license * update * ruff * Update expr.py * add test * ruff * Minor ruff whitespace change * Minor format change --------- Co-authored-by: Tim Saucer --- python/datafusion/common.py | 6 + python/datafusion/expr.py | 54 +++- python/tests/test_expr.py | 86 ++++++ src/common.rs | 3 + src/common/schema.rs | 89 ++++++ src/expr.rs | 41 +++ src/expr/copy_to.rs | 138 +++++++++ src/expr/create_catalog.rs | 100 +++++++ src/expr/create_catalog_schema.rs | 100 +++++++ src/expr/create_external_table.rs | 183 ++++++++++++ src/expr/create_function.rs | 182 ++++++++++++ src/expr/create_index.rs | 129 +++++++++ src/expr/describe_table.rs | 92 ++++++ src/expr/dml.rs | 136 +++++++++ src/expr/drop_catalog_schema.rs | 116 ++++++++ src/expr/drop_function.rs | 95 +++++++ src/expr/drop_view.rs | 102 +++++++ src/expr/recursive_query.rs | 111 ++++++++ src/expr/statement.rs | 454 ++++++++++++++++++++++++++++++ src/expr/values.rs | 86 ++++++ src/sql/logical.rs | 85 +++++- 21 files changed, 2372 insertions(+), 16 deletions(-) create mode 100644 src/expr/copy_to.rs create mode 100644 src/expr/create_catalog.rs create mode 100644 src/expr/create_catalog_schema.rs create 
mode 100644 src/expr/create_external_table.rs create mode 100644 src/expr/create_function.rs create mode 100644 src/expr/create_index.rs create mode 100644 src/expr/describe_table.rs create mode 100644 src/expr/dml.rs create mode 100644 src/expr/drop_catalog_schema.rs create mode 100644 src/expr/drop_function.rs create mode 100644 src/expr/drop_view.rs create mode 100644 src/expr/recursive_query.rs create mode 100644 src/expr/statement.rs create mode 100644 src/expr/values.rs diff --git a/python/datafusion/common.py b/python/datafusion/common.py index e762a993b..c689a816d 100644 --- a/python/datafusion/common.py +++ b/python/datafusion/common.py @@ -33,8 +33,12 @@ SqlTable = common_internal.SqlTable SqlType = common_internal.SqlType SqlView = common_internal.SqlView +TableType = common_internal.TableType +TableSource = common_internal.TableSource +Constraints = common_internal.Constraints __all__ = [ + "Constraints", "DFSchema", "DataType", "DataTypeMap", @@ -47,6 +51,8 @@ "SqlTable", "SqlType", "SqlView", + "TableSource", + "TableType", ] diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 3750eeb3f..9e58873d0 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -54,14 +54,29 @@ Case = expr_internal.Case Cast = expr_internal.Cast Column = expr_internal.Column +CopyTo = expr_internal.CopyTo +CreateCatalog = expr_internal.CreateCatalog +CreateCatalogSchema = expr_internal.CreateCatalogSchema +CreateExternalTable = expr_internal.CreateExternalTable +CreateFunction = expr_internal.CreateFunction +CreateFunctionBody = expr_internal.CreateFunctionBody +CreateIndex = expr_internal.CreateIndex CreateMemoryTable = expr_internal.CreateMemoryTable CreateView = expr_internal.CreateView +Deallocate = expr_internal.Deallocate +DescribeTable = expr_internal.DescribeTable Distinct = expr_internal.Distinct +DmlStatement = expr_internal.DmlStatement +DropCatalogSchema = expr_internal.DropCatalogSchema +DropFunction = 
expr_internal.DropFunction DropTable = expr_internal.DropTable +DropView = expr_internal.DropView EmptyRelation = expr_internal.EmptyRelation +Execute = expr_internal.Execute Exists = expr_internal.Exists Explain = expr_internal.Explain Extension = expr_internal.Extension +FileType = expr_internal.FileType Filter = expr_internal.Filter GroupingSet = expr_internal.GroupingSet Join = expr_internal.Join @@ -83,21 +98,31 @@ Literal = expr_internal.Literal Negative = expr_internal.Negative Not = expr_internal.Not +OperateFunctionArg = expr_internal.OperateFunctionArg Partitioning = expr_internal.Partitioning Placeholder = expr_internal.Placeholder +Prepare = expr_internal.Prepare Projection = expr_internal.Projection +RecursiveQuery = expr_internal.RecursiveQuery Repartition = expr_internal.Repartition ScalarSubquery = expr_internal.ScalarSubquery ScalarVariable = expr_internal.ScalarVariable +SetVariable = expr_internal.SetVariable SimilarTo = expr_internal.SimilarTo Sort = expr_internal.Sort Subquery = expr_internal.Subquery SubqueryAlias = expr_internal.SubqueryAlias TableScan = expr_internal.TableScan +TransactionAccessMode = expr_internal.TransactionAccessMode +TransactionConclusion = expr_internal.TransactionConclusion +TransactionEnd = expr_internal.TransactionEnd +TransactionIsolationLevel = expr_internal.TransactionIsolationLevel +TransactionStart = expr_internal.TransactionStart TryCast = expr_internal.TryCast Union = expr_internal.Union Unnest = expr_internal.Unnest UnnestExpr = expr_internal.UnnestExpr +Values = expr_internal.Values WindowExpr = expr_internal.WindowExpr __all__ = [ @@ -111,15 +136,30 @@ "CaseBuilder", "Cast", "Column", + "CopyTo", + "CreateCatalog", + "CreateCatalogSchema", + "CreateExternalTable", + "CreateFunction", + "CreateFunctionBody", + "CreateIndex", "CreateMemoryTable", "CreateView", + "Deallocate", + "DescribeTable", "Distinct", + "DmlStatement", + "DropCatalogSchema", + "DropFunction", "DropTable", + "DropView", "EmptyRelation", + 
"Execute", "Exists", "Explain", "Expr", "Extension", + "FileType", "Filter", "GroupingSet", "ILike", @@ -142,22 +182,32 @@ "Literal", "Negative", "Not", + "OperateFunctionArg", "Partitioning", "Placeholder", + "Prepare", "Projection", + "RecursiveQuery", "Repartition", "ScalarSubquery", "ScalarVariable", + "SetVariable", "SimilarTo", "Sort", "SortExpr", "Subquery", "SubqueryAlias", "TableScan", + "TransactionAccessMode", + "TransactionConclusion", + "TransactionEnd", + "TransactionIsolationLevel", + "TransactionStart", "TryCast", "Union", "Unnest", "UnnestExpr", + "Values", "Window", "WindowExpr", "WindowFrame", @@ -686,8 +736,8 @@ def log10(self) -> Expr: def initcap(self) -> Expr: """Set the initial letter of each word to capital. - Converts the first letter of each word in ``string`` - to uppercase and the remaining characters to lowercase. + Converts the first letter of each word in ``string`` to uppercase and the + remaining characters to lowercase. """ from . import functions as F diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py index 3651b60d6..58a202724 100644 --- a/python/tests/test_expr.py +++ b/python/tests/test_expr.py @@ -23,12 +23,21 @@ AggregateFunction, BinaryExpr, Column, + CopyTo, + CreateIndex, + DescribeTable, + DmlStatement, + DropCatalogSchema, Filter, Limit, Literal, Projection, + RecursiveQuery, Sort, TableScan, + TransactionEnd, + TransactionStart, + Values, ) @@ -249,6 +258,83 @@ def test_fill_null(df): assert result.column(2) == pa.array([1234, 1234, 8]) +def test_copy_to(): + ctx = SessionContext() + ctx.sql("CREATE TABLE foo (a int, b int)").collect() + df = ctx.sql("COPY foo TO bar STORED AS CSV") + plan = df.logical_plan() + plan = plan.to_variant() + assert isinstance(plan, CopyTo) + + +def test_create_index(): + ctx = SessionContext() + ctx.sql("CREATE TABLE foo (a int, b int)").collect() + plan = ctx.sql("create index idx on foo (a)").logical_plan() + plan = plan.to_variant() + assert isinstance(plan, CreateIndex) 
+ + +def test_describe_table(): + ctx = SessionContext() + ctx.sql("CREATE TABLE foo (a int, b int)").collect() + plan = ctx.sql("describe foo").logical_plan() + plan = plan.to_variant() + assert isinstance(plan, DescribeTable) + + +def test_dml_statement(): + ctx = SessionContext() + ctx.sql("CREATE TABLE foo (a int, b int)").collect() + plan = ctx.sql("insert into foo values (1, 2)").logical_plan() + plan = plan.to_variant() + assert isinstance(plan, DmlStatement) + + +def drop_catalog_schema(): + ctx = SessionContext() + plan = ctx.sql("drop schema cat").logical_plan() + plan = plan.to_variant() + assert isinstance(plan, DropCatalogSchema) + + +def test_recursive_query(): + ctx = SessionContext() + plan = ctx.sql( + """ + WITH RECURSIVE cte AS ( + SELECT 1 as n + UNION ALL + SELECT n + 1 FROM cte WHERE n < 5 + ) + SELECT * FROM cte; + """ + ).logical_plan() + plan = plan.inputs()[0].inputs()[0].to_variant() + assert isinstance(plan, RecursiveQuery) + + +def test_values(): + ctx = SessionContext() + plan = ctx.sql("values (1, 'foo'), (2, 'bar')").logical_plan() + plan = plan.to_variant() + assert isinstance(plan, Values) + + +def test_transaction_start(): + ctx = SessionContext() + plan = ctx.sql("START TRANSACTION").logical_plan() + plan = plan.to_variant() + assert isinstance(plan, TransactionStart) + + +def test_transaction_end(): + ctx = SessionContext() + plan = ctx.sql("COMMIT").logical_plan() + plan = plan.to_variant() + assert isinstance(plan, TransactionEnd) + + def test_col_getattr(): ctx = SessionContext() data = { diff --git a/src/common.rs b/src/common.rs index 453bf67a4..88d2fdd5f 100644 --- a/src/common.rs +++ b/src/common.rs @@ -36,5 +36,8 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; Ok(()) } diff --git a/src/common/schema.rs b/src/common/schema.rs index 66ce925ae..5a54fe333 100644 --- a/src/common/schema.rs 
+++ b/src/common/schema.rs @@ -15,14 +15,22 @@ // specific language governing permissions and limitations // under the License. +use std::fmt::{self, Display, Formatter}; +use std::sync::Arc; use std::{any::Any, borrow::Cow}; +use arrow::datatypes::Schema; +use arrow::pyarrow::PyArrowType; use datafusion::arrow::datatypes::SchemaRef; +use datafusion::common::Constraints; +use datafusion::datasource::TableType; use datafusion::logical_expr::{Expr, TableProviderFilterPushDown, TableSource}; use pyo3::prelude::*; use datafusion::logical_expr::utils::split_conjunction; +use crate::sql::logical::PyLogicalPlan; + use super::{data_type::DataTypeMap, function::SqlFunction}; #[pyclass(name = "SqlSchema", module = "datafusion.common", subclass)] @@ -218,3 +226,84 @@ impl SqlStatistics { self.row_count } } + +#[pyclass(name = "Constraints", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyConstraints { + pub constraints: Constraints, +} + +impl From for Constraints { + fn from(constraints: PyConstraints) -> Self { + constraints.constraints + } +} + +impl From for PyConstraints { + fn from(constraints: Constraints) -> Self { + PyConstraints { constraints } + } +} + +impl Display for PyConstraints { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "Constraints: {:?}", self.constraints) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[pyclass(eq, eq_int, name = "TableType", module = "datafusion.common")] +pub enum PyTableType { + Base, + View, + Temporary, +} + +impl From for datafusion::logical_expr::TableType { + fn from(table_type: PyTableType) -> Self { + match table_type { + PyTableType::Base => datafusion::logical_expr::TableType::Base, + PyTableType::View => datafusion::logical_expr::TableType::View, + PyTableType::Temporary => datafusion::logical_expr::TableType::Temporary, + } + } +} + +impl From for PyTableType { + fn from(table_type: TableType) -> Self { + match table_type { + 
datafusion::logical_expr::TableType::Base => PyTableType::Base, + datafusion::logical_expr::TableType::View => PyTableType::View, + datafusion::logical_expr::TableType::Temporary => PyTableType::Temporary, + } + } +} + +#[pyclass(name = "TableSource", module = "datafusion.common", subclass)] +#[derive(Clone)] +pub struct PyTableSource { + pub table_source: Arc, +} + +#[pymethods] +impl PyTableSource { + pub fn schema(&self) -> PyArrowType { + (*self.table_source.schema()).clone().into() + } + + pub fn constraints(&self) -> Option { + self.table_source.constraints().map(|c| PyConstraints { + constraints: c.clone(), + }) + } + + pub fn table_type(&self) -> PyTableType { + self.table_source.table_type().into() + } + + pub fn get_logical_plan(&self) -> Option { + self.table_source + .get_logical_plan() + .map(|plan| PyLogicalPlan::new(plan.into_owned())) + } +} diff --git a/src/expr.rs b/src/expr.rs index 7d4aa8798..404e575f8 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -67,10 +67,21 @@ pub mod case; pub mod cast; pub mod column; pub mod conditional_expr; +pub mod copy_to; +pub mod create_catalog; +pub mod create_catalog_schema; +pub mod create_external_table; +pub mod create_function; +pub mod create_index; pub mod create_memory_table; pub mod create_view; +pub mod describe_table; pub mod distinct; +pub mod dml; +pub mod drop_catalog_schema; +pub mod drop_function; pub mod drop_table; +pub mod drop_view; pub mod empty_relation; pub mod exists; pub mod explain; @@ -86,18 +97,21 @@ pub mod literal; pub mod logical_node; pub mod placeholder; pub mod projection; +pub mod recursive_query; pub mod repartition; pub mod scalar_subquery; pub mod scalar_variable; pub mod signature; pub mod sort; pub mod sort_expr; +pub mod statement; pub mod subquery; pub mod subquery_alias; pub mod table_scan; pub mod union; pub mod unnest; pub mod unnest_expr; +pub mod values; pub mod window; use sort_expr::{to_sort_expressions, PySortExpr}; @@ -802,5 +816,32 @@ pub(crate) fn init_module(m: 
&Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + Ok(()) } diff --git a/src/expr/copy_to.rs b/src/expr/copy_to.rs new file mode 100644 index 000000000..ebfcb8ebc --- /dev/null +++ b/src/expr/copy_to.rs @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::{ + collections::HashMap, + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::{common::file_options::file_type::FileType, logical_expr::dml::CopyTo}; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::sql::logical::PyLogicalPlan; + +use super::logical_node::LogicalNode; + +#[pyclass(name = "CopyTo", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyCopyTo { + copy: CopyTo, +} + +impl From for CopyTo { + fn from(copy: PyCopyTo) -> Self { + copy.copy + } +} + +impl From for PyCopyTo { + fn from(copy: CopyTo) -> PyCopyTo { + PyCopyTo { copy } + } +} + +impl Display for PyCopyTo { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "CopyTo: {:?}", self.copy.output_url) + } +} + +impl LogicalNode for PyCopyTo { + fn inputs(&self) -> Vec { + vec![PyLogicalPlan::from((*self.copy.input).clone())] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[pymethods] +impl PyCopyTo { + #[new] + pub fn new( + input: PyLogicalPlan, + output_url: String, + partition_by: Vec, + file_type: PyFileType, + options: HashMap, + ) -> Self { + PyCopyTo { + copy: CopyTo { + input: input.plan(), + output_url, + partition_by, + file_type: file_type.file_type, + options, + }, + } + } + + fn input(&self) -> PyLogicalPlan { + PyLogicalPlan::from((*self.copy.input).clone()) + } + + fn output_url(&self) -> String { + self.copy.output_url.clone() + } + + fn partition_by(&self) -> Vec { + self.copy.partition_by.clone() + } + + fn file_type(&self) -> PyFileType { + PyFileType { + file_type: self.copy.file_type.clone(), + } + } + + fn options(&self) -> HashMap { + self.copy.options.clone() + } + + fn __repr__(&self) -> PyResult { + Ok(format!("CopyTo({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("CopyTo".to_string()) + } +} + +#[pyclass(name = "FileType", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyFileType { + file_type: Arc, +} + 
+impl Display for PyFileType { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "FileType: {}", self.file_type) + } +} + +#[pymethods] +impl PyFileType { + fn __repr__(&self) -> PyResult { + Ok(format!("FileType({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("FileType".to_string()) + } +} diff --git a/src/expr/create_catalog.rs b/src/expr/create_catalog.rs new file mode 100644 index 000000000..f4ea0f517 --- /dev/null +++ b/src/expr/create_catalog.rs @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::{ + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::logical_expr::CreateCatalog; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; + +use super::logical_node::LogicalNode; + +#[pyclass(name = "CreateCatalog", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyCreateCatalog { + create: CreateCatalog, +} + +impl From for CreateCatalog { + fn from(create: PyCreateCatalog) -> Self { + create.create + } +} + +impl From for PyCreateCatalog { + fn from(create: CreateCatalog) -> PyCreateCatalog { + PyCreateCatalog { create } + } +} + +impl Display for PyCreateCatalog { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "CreateCatalog: {:?}", self.create.catalog_name) + } +} + +#[pymethods] +impl PyCreateCatalog { + #[new] + pub fn new( + catalog_name: String, + if_not_exists: bool, + schema: PyDFSchema, + ) -> PyResult { + Ok(PyCreateCatalog { + create: CreateCatalog { + catalog_name, + if_not_exists, + schema: Arc::new(schema.into()), + }, + }) + } + + pub fn catalog_name(&self) -> String { + self.create.catalog_name.clone() + } + + pub fn if_not_exists(&self) -> bool { + self.create.if_not_exists + } + + pub fn schema(&self) -> PyDFSchema { + (*self.create.schema).clone().into() + } + + fn __repr__(&self) -> PyResult { + Ok(format!("CreateCatalog({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("CreateCatalog".to_string()) + } +} + +impl LogicalNode for PyCreateCatalog { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/create_catalog_schema.rs b/src/expr/create_catalog_schema.rs new file mode 100644 index 000000000..85f447e1e --- /dev/null +++ b/src/expr/create_catalog_schema.rs @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::logical_expr::CreateCatalogSchema; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; + +use super::logical_node::LogicalNode; + +#[pyclass(name = "CreateCatalogSchema", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyCreateCatalogSchema { + create: CreateCatalogSchema, +} + +impl From for CreateCatalogSchema { + fn from(create: PyCreateCatalogSchema) -> Self { + create.create + } +} + +impl From for PyCreateCatalogSchema { + fn from(create: CreateCatalogSchema) -> PyCreateCatalogSchema { + PyCreateCatalogSchema { create } + } +} + +impl Display for PyCreateCatalogSchema { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "CreateCatalogSchema: {:?}", self.create.schema_name) + } +} + +#[pymethods] +impl PyCreateCatalogSchema { + #[new] + pub fn new( + schema_name: String, + if_not_exists: bool, + schema: PyDFSchema, + ) -> PyResult { + Ok(PyCreateCatalogSchema { + create: CreateCatalogSchema { + schema_name, + if_not_exists, + schema: Arc::new(schema.into()), + }, + }) + } + + pub fn schema_name(&self) -> String { + self.create.schema_name.clone() + } + + pub fn if_not_exists(&self) -> 
bool { + self.create.if_not_exists + } + + pub fn schema(&self) -> PyDFSchema { + (*self.create.schema).clone().into() + } + + fn __repr__(&self) -> PyResult { + Ok(format!("CreateCatalogSchema({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("CreateCatalogSchema".to_string()) + } +} + +impl LogicalNode for PyCreateCatalogSchema { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/create_external_table.rs b/src/expr/create_external_table.rs new file mode 100644 index 000000000..01ce7d0ca --- /dev/null +++ b/src/expr/create_external_table.rs @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::{common::schema::PyConstraints, expr::PyExpr, sql::logical::PyLogicalPlan}; +use std::{ + collections::HashMap, + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::logical_expr::CreateExternalTable; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::common::df_schema::PyDFSchema; + +use super::{logical_node::LogicalNode, sort_expr::PySortExpr}; + +#[pyclass(name = "CreateExternalTable", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyCreateExternalTable { + create: CreateExternalTable, +} + +impl From for CreateExternalTable { + fn from(create: PyCreateExternalTable) -> Self { + create.create + } +} + +impl From for PyCreateExternalTable { + fn from(create: CreateExternalTable) -> PyCreateExternalTable { + PyCreateExternalTable { create } + } +} + +impl Display for PyCreateExternalTable { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!( + f, + "CreateExternalTable: {:?}{}", + self.create.name, self.create.constraints + ) + } +} + +#[pymethods] +impl PyCreateExternalTable { + #[allow(clippy::too_many_arguments)] + #[new] + #[pyo3(signature = (schema, name, location, file_type, table_partition_cols, if_not_exists, temporary, order_exprs, unbounded, options, constraints, column_defaults, definition=None))] + pub fn new( + schema: PyDFSchema, + name: String, + location: String, + file_type: String, + table_partition_cols: Vec, + if_not_exists: bool, + temporary: bool, + order_exprs: Vec>, + unbounded: bool, + options: HashMap, + constraints: PyConstraints, + column_defaults: HashMap, + definition: Option, + ) -> Self { + let create = CreateExternalTable { + schema: Arc::new(schema.into()), + name: name.into(), + location, + file_type, + table_partition_cols, + if_not_exists, + temporary, + definition, + order_exprs: order_exprs + .into_iter() + .map(|vec| vec.into_iter().map(|s| s.into()).collect::>()) + .collect::>(), + unbounded, + options, + constraints: constraints.constraints, + 
column_defaults: column_defaults + .into_iter() + .map(|(k, v)| (k, v.into())) + .collect(), + }; + PyCreateExternalTable { create } + } + + pub fn schema(&self) -> PyDFSchema { + (*self.create.schema).clone().into() + } + + pub fn name(&self) -> PyResult { + Ok(self.create.name.to_string()) + } + + pub fn location(&self) -> String { + self.create.location.clone() + } + + pub fn file_type(&self) -> String { + self.create.file_type.clone() + } + + pub fn table_partition_cols(&self) -> Vec { + self.create.table_partition_cols.clone() + } + + pub fn if_not_exists(&self) -> bool { + self.create.if_not_exists + } + + pub fn temporary(&self) -> bool { + self.create.temporary + } + + pub fn definition(&self) -> Option { + self.create.definition.clone() + } + + pub fn order_exprs(&self) -> Vec> { + self.create + .order_exprs + .iter() + .map(|vec| vec.iter().map(|s| s.clone().into()).collect()) + .collect() + } + + pub fn unbounded(&self) -> bool { + self.create.unbounded + } + + pub fn options(&self) -> HashMap { + self.create.options.clone() + } + + pub fn constraints(&self) -> PyConstraints { + PyConstraints { + constraints: self.create.constraints.clone(), + } + } + + pub fn column_defaults(&self) -> HashMap { + self.create + .column_defaults + .iter() + .map(|(k, v)| (k.clone(), v.clone().into())) + .collect() + } + + fn __repr__(&self) -> PyResult { + Ok(format!("CreateExternalTable({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("CreateExternalTable".to_string()) + } +} + +impl LogicalNode for PyCreateExternalTable { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/create_function.rs b/src/expr/create_function.rs new file mode 100644 index 000000000..6f3c3f0ff --- /dev/null +++ b/src/expr/create_function.rs @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::logical_expr::{ + CreateFunction, CreateFunctionBody, OperateFunctionArg, Volatility, +}; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use super::logical_node::LogicalNode; +use super::PyExpr; +use crate::common::{data_type::PyDataType, df_schema::PyDFSchema}; +use crate::sql::logical::PyLogicalPlan; + +#[pyclass(name = "CreateFunction", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyCreateFunction { + create: CreateFunction, +} + +impl From for CreateFunction { + fn from(create: PyCreateFunction) -> Self { + create.create + } +} + +impl From for PyCreateFunction { + fn from(create: CreateFunction) -> PyCreateFunction { + PyCreateFunction { create } + } +} + +impl Display for PyCreateFunction { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "CreateFunction: name {:?}", self.create.name) + } +} + +#[pyclass(name = "OperateFunctionArg", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyOperateFunctionArg { + arg: OperateFunctionArg, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[pyclass(eq, eq_int, name = "Volatility", module = "datafusion.expr")] +pub enum PyVolatility { + Immutable, + Stable, + Volatile, +} 
+ +#[pyclass(name = "CreateFunctionBody", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyCreateFunctionBody { + body: CreateFunctionBody, +} + +#[pymethods] +impl PyCreateFunctionBody { + pub fn language(&self) -> Option { + self.body + .language + .as_ref() + .map(|language| language.to_string()) + } + + pub fn behavior(&self) -> Option { + self.body.behavior.as_ref().map(|behavior| match behavior { + Volatility::Immutable => PyVolatility::Immutable, + Volatility::Stable => PyVolatility::Stable, + Volatility::Volatile => PyVolatility::Volatile, + }) + } + + pub fn function_body(&self) -> Option { + self.body + .function_body + .as_ref() + .map(|function_body| function_body.clone().into()) + } +} + +#[pymethods] +impl PyCreateFunction { + #[new] + #[pyo3(signature = (or_replace, temporary, name, params, schema, return_type=None, args=None))] + pub fn new( + or_replace: bool, + temporary: bool, + name: String, + params: PyCreateFunctionBody, + schema: PyDFSchema, + return_type: Option, + args: Option>, + ) -> Self { + PyCreateFunction { + create: CreateFunction { + or_replace, + temporary, + name, + args: args.map(|args| args.into_iter().map(|arg| arg.arg).collect()), + return_type: return_type.map(|return_type| return_type.data_type), + params: params.body, + schema: Arc::new(schema.into()), + }, + } + } + + pub fn or_replace(&self) -> bool { + self.create.or_replace + } + + pub fn temporary(&self) -> bool { + self.create.temporary + } + + pub fn name(&self) -> String { + self.create.name.clone() + } + + pub fn params(&self) -> PyCreateFunctionBody { + PyCreateFunctionBody { + body: self.create.params.clone(), + } + } + + pub fn schema(&self) -> PyDFSchema { + (*self.create.schema).clone().into() + } + + pub fn return_type(&self) -> Option { + self.create + .return_type + .as_ref() + .map(|return_type| return_type.clone().into()) + } + + pub fn args(&self) -> Option> { + self.create.args.as_ref().map(|args| { + args.iter() + .map(|arg| 
PyOperateFunctionArg { arg: arg.clone() }) + .collect() + }) + } + + fn __repr__(&self) -> PyResult { + Ok(format!("CreateFunction({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("CreateFunction".to_string()) + } +} + +impl LogicalNode for PyCreateFunction { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/create_index.rs b/src/expr/create_index.rs new file mode 100644 index 000000000..13dadbc3f --- /dev/null +++ b/src/expr/create_index.rs @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::{ + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::logical_expr::CreateIndex; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; + +use super::{logical_node::LogicalNode, sort_expr::PySortExpr}; + +#[pyclass(name = "CreateIndex", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyCreateIndex { + create: CreateIndex, +} + +impl From for CreateIndex { + fn from(create: PyCreateIndex) -> Self { + create.create + } +} + +impl From for PyCreateIndex { + fn from(create: CreateIndex) -> PyCreateIndex { + PyCreateIndex { create } + } +} + +impl Display for PyCreateIndex { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "CreateIndex: {:?}", self.create.name) + } +} + +#[pymethods] +impl PyCreateIndex { + #[new] + #[pyo3(signature = (table, columns, unique, if_not_exists, schema, name=None, using=None))] + pub fn new( + table: String, + columns: Vec, + unique: bool, + if_not_exists: bool, + schema: PyDFSchema, + name: Option, + using: Option, + ) -> PyResult { + Ok(PyCreateIndex { + create: CreateIndex { + name, + table: table.into(), + using, + columns: columns.iter().map(|c| c.clone().into()).collect(), + unique, + if_not_exists, + schema: Arc::new(schema.into()), + }, + }) + } + + pub fn name(&self) -> Option { + self.create.name.clone() + } + + pub fn table(&self) -> PyResult { + Ok(self.create.table.to_string()) + } + + pub fn using(&self) -> Option { + self.create.using.clone() + } + + pub fn columns(&self) -> Vec { + self.create + .columns + .iter() + .map(|c| c.clone().into()) + .collect() + } + + pub fn unique(&self) -> bool { + self.create.unique + } + + pub fn if_not_exists(&self) -> bool { + self.create.if_not_exists + } + + pub fn schema(&self) -> PyDFSchema { + (*self.create.schema).clone().into() + } + + fn __repr__(&self) -> PyResult { + Ok(format!("CreateIndex({})", self)) + } + + fn __name__(&self) -> PyResult { + 
Ok("CreateIndex".to_string()) + } +} + +impl LogicalNode for PyCreateIndex { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/describe_table.rs b/src/expr/describe_table.rs new file mode 100644 index 000000000..5658a13f2 --- /dev/null +++ b/src/expr/describe_table.rs @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::{ + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use arrow::{datatypes::Schema, pyarrow::PyArrowType}; +use datafusion::logical_expr::DescribeTable; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; + +use super::logical_node::LogicalNode; + +#[pyclass(name = "DescribeTable", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyDescribeTable { + describe: DescribeTable, +} + +impl Display for PyDescribeTable { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "DescribeTable") + } +} + +#[pymethods] +impl PyDescribeTable { + #[new] + fn new(schema: PyArrowType, output_schema: PyDFSchema) -> Self { + Self { + describe: DescribeTable { + schema: Arc::new(schema.0), + output_schema: Arc::new(output_schema.into()), + }, + } + } + + pub fn schema(&self) -> PyArrowType { + (*self.describe.schema).clone().into() + } + + pub fn output_schema(&self) -> PyDFSchema { + (*self.describe.output_schema).clone().into() + } + + fn __repr__(&self) -> PyResult { + Ok(format!("DescribeTable({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("DescribeTable".to_string()) + } +} + +impl From for DescribeTable { + fn from(describe: PyDescribeTable) -> Self { + describe.describe + } +} + +impl From for PyDescribeTable { + fn from(describe: DescribeTable) -> PyDescribeTable { + PyDescribeTable { describe } + } +} + +impl LogicalNode for PyDescribeTable { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/dml.rs b/src/expr/dml.rs new file mode 100644 index 000000000..251e336cc --- /dev/null +++ b/src/expr/dml.rs @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion::logical_expr::dml::InsertOp; +use datafusion::logical_expr::{DmlStatement, WriteOp}; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::common::schema::PyTableSource; +use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; + +use super::logical_node::LogicalNode; + +#[pyclass(name = "DmlStatement", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyDmlStatement { + dml: DmlStatement, +} + +impl From for DmlStatement { + fn from(dml: PyDmlStatement) -> Self { + dml.dml + } +} + +impl From for PyDmlStatement { + fn from(dml: DmlStatement) -> PyDmlStatement { + PyDmlStatement { dml } + } +} + +impl LogicalNode for PyDmlStatement { + fn inputs(&self) -> Vec { + vec![PyLogicalPlan::from((*self.dml.input).clone())] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[pymethods] +impl PyDmlStatement { + pub fn table_name(&self) -> PyResult { + Ok(self.dml.table_name.to_string()) + } + + pub fn target(&self) -> PyResult { + Ok(PyTableSource { + table_source: self.dml.target.clone(), + }) + } + + pub fn op(&self) -> PyWriteOp { + self.dml.op.clone().into() + } + + pub fn input(&self) -> PyLogicalPlan { + PyLogicalPlan { + plan: self.dml.input.clone(), + } + } + + pub fn output_schema(&self) -> PyDFSchema { + (*self.dml.output_schema).clone().into() + } + + fn 
__repr__(&self) -> PyResult { + Ok("DmlStatement".to_string()) + } + + fn __name__(&self) -> PyResult { + Ok("DmlStatement".to_string()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[pyclass(eq, eq_int, name = "WriteOp", module = "datafusion.expr")] +pub enum PyWriteOp { + Append, + Overwrite, + Replace, + + Update, + Delete, + Ctas, +} + +impl From for PyWriteOp { + fn from(write_op: WriteOp) -> Self { + match write_op { + WriteOp::Insert(InsertOp::Append) => PyWriteOp::Append, + WriteOp::Insert(InsertOp::Overwrite) => PyWriteOp::Overwrite, + WriteOp::Insert(InsertOp::Replace) => PyWriteOp::Replace, + + WriteOp::Update => PyWriteOp::Update, + WriteOp::Delete => PyWriteOp::Delete, + WriteOp::Ctas => PyWriteOp::Ctas, + } + } +} + +impl From for WriteOp { + fn from(py: PyWriteOp) -> Self { + match py { + PyWriteOp::Append => WriteOp::Insert(InsertOp::Append), + PyWriteOp::Overwrite => WriteOp::Insert(InsertOp::Overwrite), + PyWriteOp::Replace => WriteOp::Insert(InsertOp::Replace), + + PyWriteOp::Update => WriteOp::Update, + PyWriteOp::Delete => WriteOp::Delete, + PyWriteOp::Ctas => WriteOp::Ctas, + } + } +} + +#[pymethods] +impl PyWriteOp { + fn name(&self) -> String { + let write_op: WriteOp = self.clone().into(); + write_op.name().to_string() + } +} diff --git a/src/expr/drop_catalog_schema.rs b/src/expr/drop_catalog_schema.rs new file mode 100644 index 000000000..b7420a99c --- /dev/null +++ b/src/expr/drop_catalog_schema.rs @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::{common::SchemaReference, logical_expr::DropCatalogSchema, sql::TableReference}; +use pyo3::{exceptions::PyValueError, prelude::*, IntoPyObjectExt}; + +use crate::common::df_schema::PyDFSchema; + +use super::logical_node::LogicalNode; +use crate::sql::logical::PyLogicalPlan; + +#[pyclass(name = "DropCatalogSchema", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyDropCatalogSchema { + drop: DropCatalogSchema, +} + +impl From for DropCatalogSchema { + fn from(drop: PyDropCatalogSchema) -> Self { + drop.drop + } +} + +impl From for PyDropCatalogSchema { + fn from(drop: DropCatalogSchema) -> PyDropCatalogSchema { + PyDropCatalogSchema { drop } + } +} + +impl Display for PyDropCatalogSchema { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "DropCatalogSchema") + } +} + +fn parse_schema_reference(name: String) -> PyResult { + match name.into() { + TableReference::Bare { table } => Ok(SchemaReference::Bare { schema: table }), + TableReference::Partial { schema, table } => Ok(SchemaReference::Full { + schema: table, + catalog: schema, + }), + TableReference::Full { + catalog: _, + schema: _, + table: _, + } => Err(PyErr::new::( + "Invalid schema specifier (has 3 parts)".to_string(), + )), + } +} + +#[pymethods] +impl PyDropCatalogSchema { + #[new] + fn new(name: String, schema: PyDFSchema, if_exists: bool, cascade: bool) -> PyResult { + let name = parse_schema_reference(name)?; + Ok(PyDropCatalogSchema { + drop: DropCatalogSchema { + 
name, + schema: Arc::new(schema.into()), + if_exists, + cascade, + }, + }) + } + + fn name(&self) -> PyResult { + Ok(self.drop.name.to_string()) + } + + fn schema(&self) -> PyDFSchema { + (*self.drop.schema).clone().into() + } + + fn if_exists(&self) -> PyResult { + Ok(self.drop.if_exists) + } + + fn cascade(&self) -> PyResult { + Ok(self.drop.cascade) + } + + fn __repr__(&self) -> PyResult { + Ok(format!("DropCatalogSchema({})", self)) + } +} + +impl LogicalNode for PyDropCatalogSchema { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/drop_function.rs b/src/expr/drop_function.rs new file mode 100644 index 000000000..9fbd78fdc --- /dev/null +++ b/src/expr/drop_function.rs @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::{ + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::logical_expr::DropFunction; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use super::logical_node::LogicalNode; +use crate::common::df_schema::PyDFSchema; +use crate::sql::logical::PyLogicalPlan; + +#[pyclass(name = "DropFunction", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyDropFunction { + drop: DropFunction, +} + +impl From for DropFunction { + fn from(drop: PyDropFunction) -> Self { + drop.drop + } +} + +impl From for PyDropFunction { + fn from(drop: DropFunction) -> PyDropFunction { + PyDropFunction { drop } + } +} + +impl Display for PyDropFunction { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "DropFunction") + } +} + +#[pymethods] +impl PyDropFunction { + #[new] + fn new(name: String, schema: PyDFSchema, if_exists: bool) -> PyResult { + Ok(PyDropFunction { + drop: DropFunction { + name, + schema: Arc::new(schema.into()), + if_exists, + }, + }) + } + fn name(&self) -> PyResult { + Ok(self.drop.name.clone()) + } + + fn schema(&self) -> PyDFSchema { + (*self.drop.schema).clone().into() + } + + fn if_exists(&self) -> PyResult { + Ok(self.drop.if_exists) + } + + fn __repr__(&self) -> PyResult { + Ok(format!("DropFunction({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("DropFunction".to_string()) + } +} + +impl LogicalNode for PyDropFunction { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/drop_view.rs b/src/expr/drop_view.rs new file mode 100644 index 000000000..1d1ab1e59 --- /dev/null +++ b/src/expr/drop_view.rs @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::logical_expr::DropView; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::common::df_schema::PyDFSchema; + +use super::logical_node::LogicalNode; +use crate::sql::logical::PyLogicalPlan; + +#[pyclass(name = "DropView", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyDropView { + drop: DropView, +} + +impl From for DropView { + fn from(drop: PyDropView) -> Self { + drop.drop + } +} + +impl From for PyDropView { + fn from(drop: DropView) -> PyDropView { + PyDropView { drop } + } +} + +impl Display for PyDropView { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!( + f, + "DropView: {name:?} if not exist:={if_exists}", + name = self.drop.name, + if_exists = self.drop.if_exists + ) + } +} + +#[pymethods] +impl PyDropView { + #[new] + fn new(name: String, schema: PyDFSchema, if_exists: bool) -> PyResult { + Ok(PyDropView { + drop: DropView { + name: name.into(), + schema: Arc::new(schema.into()), + if_exists, + }, + }) + } + + fn name(&self) -> PyResult { + Ok(self.drop.name.to_string()) + } + + fn schema(&self) -> PyDFSchema { + (*self.drop.schema).clone().into() + } + + fn if_exists(&self) -> PyResult { + Ok(self.drop.if_exists) + } + + fn __repr__(&self) -> PyResult { + Ok(format!("DropView({})", self)) + } + + fn __name__(&self) -> PyResult { + 
Ok("DropView".to_string()) + } +} + +impl LogicalNode for PyDropView { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/recursive_query.rs b/src/expr/recursive_query.rs new file mode 100644 index 000000000..65181f7d3 --- /dev/null +++ b/src/expr/recursive_query.rs @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::fmt::{self, Display, Formatter}; + +use datafusion::logical_expr::RecursiveQuery; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::sql::logical::PyLogicalPlan; + +use super::logical_node::LogicalNode; + +#[pyclass(name = "RecursiveQuery", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyRecursiveQuery { + query: RecursiveQuery, +} + +impl From for RecursiveQuery { + fn from(query: PyRecursiveQuery) -> Self { + query.query + } +} + +impl From for PyRecursiveQuery { + fn from(query: RecursiveQuery) -> PyRecursiveQuery { + PyRecursiveQuery { query } + } +} + +impl Display for PyRecursiveQuery { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!( + f, + "RecursiveQuery {name:?} is_distinct:={is_distinct}", + name = self.query.name, + is_distinct = self.query.is_distinct + ) + } +} + +#[pymethods] +impl PyRecursiveQuery { + #[new] + fn new( + name: String, + static_term: PyLogicalPlan, + recursive_term: PyLogicalPlan, + is_distinct: bool, + ) -> Self { + Self { + query: RecursiveQuery { + name, + static_term: static_term.plan(), + recursive_term: recursive_term.plan(), + is_distinct, + }, + } + } + + fn name(&self) -> PyResult { + Ok(self.query.name.clone()) + } + + fn static_term(&self) -> PyLogicalPlan { + PyLogicalPlan::from((*self.query.static_term).clone()) + } + + fn recursive_term(&self) -> PyLogicalPlan { + PyLogicalPlan::from((*self.query.recursive_term).clone()) + } + + fn is_distinct(&self) -> PyResult { + Ok(self.query.is_distinct) + } + + fn __repr__(&self) -> PyResult { + Ok(format!("RecursiveQuery({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("RecursiveQuery".to_string()) + } +} + +impl LogicalNode for PyRecursiveQuery { + fn inputs(&self) -> Vec { + vec![ + PyLogicalPlan::from((*self.query.static_term).clone()), + PyLogicalPlan::from((*self.query.recursive_term).clone()), + ] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff 
--git a/src/expr/statement.rs b/src/expr/statement.rs new file mode 100644 index 000000000..83774cda1 --- /dev/null +++ b/src/expr/statement.rs @@ -0,0 +1,454 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion::logical_expr::{ + Deallocate, Execute, Prepare, SetVariable, TransactionAccessMode, TransactionConclusion, + TransactionEnd, TransactionIsolationLevel, TransactionStart, +}; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::{common::data_type::PyDataType, sql::logical::PyLogicalPlan}; + +use super::{logical_node::LogicalNode, PyExpr}; + +#[pyclass(name = "TransactionStart", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyTransactionStart { + transaction_start: TransactionStart, +} + +impl From for PyTransactionStart { + fn from(transaction_start: TransactionStart) -> PyTransactionStart { + PyTransactionStart { transaction_start } + } +} + +impl TryFrom for TransactionStart { + type Error = PyErr; + + fn try_from(py: PyTransactionStart) -> Result { + Ok(py.transaction_start) + } +} + +impl LogicalNode for PyTransactionStart { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + 
} +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[pyclass(eq, eq_int, name = "TransactionAccessMode", module = "datafusion.expr")] +pub enum PyTransactionAccessMode { + ReadOnly, + ReadWrite, +} + +impl From for PyTransactionAccessMode { + fn from(access_mode: TransactionAccessMode) -> PyTransactionAccessMode { + match access_mode { + TransactionAccessMode::ReadOnly => PyTransactionAccessMode::ReadOnly, + TransactionAccessMode::ReadWrite => PyTransactionAccessMode::ReadWrite, + } + } +} + +impl TryFrom for TransactionAccessMode { + type Error = PyErr; + + fn try_from(py: PyTransactionAccessMode) -> Result { + match py { + PyTransactionAccessMode::ReadOnly => Ok(TransactionAccessMode::ReadOnly), + PyTransactionAccessMode::ReadWrite => Ok(TransactionAccessMode::ReadWrite), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[pyclass( + eq, + eq_int, + name = "TransactionIsolationLevel", + module = "datafusion.expr" +)] +pub enum PyTransactionIsolationLevel { + ReadUncommitted, + ReadCommitted, + RepeatableRead, + Serializable, + Snapshot, +} + +impl From for PyTransactionIsolationLevel { + fn from(isolation_level: TransactionIsolationLevel) -> PyTransactionIsolationLevel { + match isolation_level { + TransactionIsolationLevel::ReadUncommitted => { + PyTransactionIsolationLevel::ReadUncommitted + } + TransactionIsolationLevel::ReadCommitted => PyTransactionIsolationLevel::ReadCommitted, + TransactionIsolationLevel::RepeatableRead => { + PyTransactionIsolationLevel::RepeatableRead + } + TransactionIsolationLevel::Serializable => PyTransactionIsolationLevel::Serializable, + TransactionIsolationLevel::Snapshot => PyTransactionIsolationLevel::Snapshot, + } + } +} + +impl TryFrom for TransactionIsolationLevel { + type Error = PyErr; + + fn try_from(value: PyTransactionIsolationLevel) -> Result { + match value { + PyTransactionIsolationLevel::ReadUncommitted => { + Ok(TransactionIsolationLevel::ReadUncommitted) + } + 
PyTransactionIsolationLevel::ReadCommitted => { + Ok(TransactionIsolationLevel::ReadCommitted) + } + PyTransactionIsolationLevel::RepeatableRead => { + Ok(TransactionIsolationLevel::RepeatableRead) + } + PyTransactionIsolationLevel::Serializable => { + Ok(TransactionIsolationLevel::Serializable) + } + PyTransactionIsolationLevel::Snapshot => Ok(TransactionIsolationLevel::Snapshot), + } + } +} + +#[pymethods] +impl PyTransactionStart { + #[new] + pub fn new( + access_mode: PyTransactionAccessMode, + isolation_level: PyTransactionIsolationLevel, + ) -> PyResult { + let access_mode = access_mode.try_into()?; + let isolation_level = isolation_level.try_into()?; + Ok(PyTransactionStart { + transaction_start: TransactionStart { + access_mode, + isolation_level, + }, + }) + } + + pub fn access_mode(&self) -> PyResult { + Ok(self.transaction_start.access_mode.clone().into()) + } + + pub fn isolation_level(&self) -> PyResult { + Ok(self.transaction_start.isolation_level.clone().into()) + } +} + +#[pyclass(name = "TransactionEnd", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyTransactionEnd { + transaction_end: TransactionEnd, +} + +impl From for PyTransactionEnd { + fn from(transaction_end: TransactionEnd) -> PyTransactionEnd { + PyTransactionEnd { transaction_end } + } +} + +impl TryFrom for TransactionEnd { + type Error = PyErr; + + fn try_from(py: PyTransactionEnd) -> Result { + Ok(py.transaction_end) + } +} + +impl LogicalNode for PyTransactionEnd { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[pyclass(eq, eq_int, name = "TransactionConclusion", module = "datafusion.expr")] +pub enum PyTransactionConclusion { + Commit, + Rollback, +} + +impl From for PyTransactionConclusion { + fn from(value: TransactionConclusion) -> Self { + match value { + TransactionConclusion::Commit => 
PyTransactionConclusion::Commit, + TransactionConclusion::Rollback => PyTransactionConclusion::Rollback, + } + } +} + +impl TryFrom for TransactionConclusion { + type Error = PyErr; + + fn try_from(value: PyTransactionConclusion) -> Result { + match value { + PyTransactionConclusion::Commit => Ok(TransactionConclusion::Commit), + PyTransactionConclusion::Rollback => Ok(TransactionConclusion::Rollback), + } + } +} +#[pymethods] +impl PyTransactionEnd { + #[new] + pub fn new(conclusion: PyTransactionConclusion, chain: bool) -> PyResult { + let conclusion = conclusion.try_into()?; + Ok(PyTransactionEnd { + transaction_end: TransactionEnd { conclusion, chain }, + }) + } + + pub fn conclusion(&self) -> PyResult { + Ok(self.transaction_end.conclusion.clone().into()) + } + + pub fn chain(&self) -> bool { + self.transaction_end.chain + } +} + +#[pyclass(name = "SetVariable", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PySetVariable { + set_variable: SetVariable, +} + +impl From for PySetVariable { + fn from(set_variable: SetVariable) -> PySetVariable { + PySetVariable { set_variable } + } +} + +impl TryFrom for SetVariable { + type Error = PyErr; + + fn try_from(py: PySetVariable) -> Result { + Ok(py.set_variable) + } +} + +impl LogicalNode for PySetVariable { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[pymethods] +impl PySetVariable { + #[new] + pub fn new(variable: String, value: String) -> Self { + PySetVariable { + set_variable: SetVariable { variable, value }, + } + } + + pub fn variable(&self) -> String { + self.set_variable.variable.clone() + } + + pub fn value(&self) -> String { + self.set_variable.value.clone() + } +} + +#[pyclass(name = "Prepare", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyPrepare { + prepare: Prepare, +} + +impl From for PyPrepare { + fn from(prepare: Prepare) -> PyPrepare { + PyPrepare 
{ prepare } + } +} + +impl TryFrom for Prepare { + type Error = PyErr; + + fn try_from(py: PyPrepare) -> Result { + Ok(py.prepare) + } +} + +impl LogicalNode for PyPrepare { + fn inputs(&self) -> Vec { + vec![PyLogicalPlan::from((*self.prepare.input).clone())] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[pymethods] +impl PyPrepare { + #[new] + pub fn new(name: String, data_types: Vec, input: PyLogicalPlan) -> Self { + let input = input.plan().clone(); + let data_types = data_types + .into_iter() + .map(|data_type| data_type.into()) + .collect(); + PyPrepare { + prepare: Prepare { + name, + data_types, + input, + }, + } + } + + pub fn name(&self) -> String { + self.prepare.name.clone() + } + + pub fn data_types(&self) -> Vec { + self.prepare + .data_types + .clone() + .into_iter() + .map(|t| t.into()) + .collect() + } + + pub fn input(&self) -> PyLogicalPlan { + PyLogicalPlan { + plan: self.prepare.input.clone(), + } + } +} + +#[pyclass(name = "Execute", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyExecute { + execute: Execute, +} + +impl From for PyExecute { + fn from(execute: Execute) -> PyExecute { + PyExecute { execute } + } +} + +impl TryFrom for Execute { + type Error = PyErr; + + fn try_from(py: PyExecute) -> Result { + Ok(py.execute) + } +} + +impl LogicalNode for PyExecute { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[pymethods] +impl PyExecute { + #[new] + pub fn new(name: String, parameters: Vec) -> Self { + let parameters = parameters + .into_iter() + .map(|parameter| parameter.into()) + .collect(); + PyExecute { + execute: Execute { name, parameters }, + } + } + + pub fn name(&self) -> String { + self.execute.name.clone() + } + + pub fn parameters(&self) -> Vec { + self.execute + .parameters + .clone() + .into_iter() + .map(|t| t.into()) + .collect() + 
} +} + +#[pyclass(name = "Deallocate", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyDeallocate { + deallocate: Deallocate, +} + +impl From for PyDeallocate { + fn from(deallocate: Deallocate) -> PyDeallocate { + PyDeallocate { deallocate } + } +} + +impl TryFrom for Deallocate { + type Error = PyErr; + + fn try_from(py: PyDeallocate) -> Result { + Ok(py.deallocate) + } +} + +impl LogicalNode for PyDeallocate { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[pymethods] +impl PyDeallocate { + #[new] + pub fn new(name: String) -> Self { + PyDeallocate { + deallocate: Deallocate { name }, + } + } + + pub fn name(&self) -> String { + self.deallocate.name.clone() + } +} diff --git a/src/expr/values.rs b/src/expr/values.rs new file mode 100644 index 000000000..fb2692230 --- /dev/null +++ b/src/expr/values.rs @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::sync::Arc; + +use datafusion::logical_expr::Values; +use pyo3::{prelude::*, IntoPyObjectExt}; +use pyo3::{pyclass, PyErr, PyResult, Python}; + +use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; + +use super::{logical_node::LogicalNode, PyExpr}; + +#[pyclass(name = "Values", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyValues { + values: Values, +} + +impl From for PyValues { + fn from(values: Values) -> PyValues { + PyValues { values } + } +} + +impl TryFrom for Values { + type Error = PyErr; + + fn try_from(py: PyValues) -> Result { + Ok(py.values) + } +} + +impl LogicalNode for PyValues { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[pymethods] +impl PyValues { + #[new] + pub fn new(schema: PyDFSchema, values: Vec>) -> PyResult { + let values = values + .into_iter() + .map(|row| row.into_iter().map(|expr| expr.into()).collect()) + .collect(); + Ok(PyValues { + values: Values { + schema: Arc::new(schema.into()), + values, + }, + }) + } + + pub fn schema(&self) -> PyResult { + Ok((*self.values.schema).clone().into()) + } + + pub fn values(&self) -> Vec> { + self.values + .values + .clone() + .into_iter() + .map(|row| row.into_iter().map(|expr| expr.into()).collect()) + .collect() + } +} diff --git a/src/sql/logical.rs b/src/sql/logical.rs index 96561c434..198d68bdc 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -17,10 +17,25 @@ use std::sync::Arc; +use crate::context::PySessionContext; use crate::errors::PyDataFusionResult; use crate::expr::aggregate::PyAggregate; use crate::expr::analyze::PyAnalyze; +use crate::expr::copy_to::PyCopyTo; +use crate::expr::create_catalog::PyCreateCatalog; +use crate::expr::create_catalog_schema::PyCreateCatalogSchema; +use crate::expr::create_external_table::PyCreateExternalTable; +use crate::expr::create_function::PyCreateFunction; +use 
crate::expr::create_index::PyCreateIndex; +use crate::expr::create_memory_table::PyCreateMemoryTable; +use crate::expr::create_view::PyCreateView; +use crate::expr::describe_table::PyDescribeTable; use crate::expr::distinct::PyDistinct; +use crate::expr::dml::PyDmlStatement; +use crate::expr::drop_catalog_schema::PyDropCatalogSchema; +use crate::expr::drop_function::PyDropFunction; +use crate::expr::drop_table::PyDropTable; +use crate::expr::drop_view::PyDropView; use crate::expr::empty_relation::PyEmptyRelation; use crate::expr::explain::PyExplain; use crate::expr::extension::PyExtension; @@ -28,14 +43,20 @@ use crate::expr::filter::PyFilter; use crate::expr::join::PyJoin; use crate::expr::limit::PyLimit; use crate::expr::projection::PyProjection; +use crate::expr::recursive_query::PyRecursiveQuery; +use crate::expr::repartition::PyRepartition; use crate::expr::sort::PySort; +use crate::expr::statement::{ + PyDeallocate, PyExecute, PyPrepare, PySetVariable, PyTransactionEnd, PyTransactionStart, +}; use crate::expr::subquery::PySubquery; use crate::expr::subquery_alias::PySubqueryAlias; use crate::expr::table_scan::PyTableScan; +use crate::expr::union::PyUnion; use crate::expr::unnest::PyUnnest; +use crate::expr::values::PyValues; use crate::expr::window::PyWindowExpr; -use crate::{context::PySessionContext, errors::py_unsupported_variant_err}; -use datafusion::logical_expr::LogicalPlan; +use datafusion::logical_expr::{DdlStatement, LogicalPlan, Statement}; use datafusion_proto::logical_plan::{AsLogicalPlan, DefaultLogicalExtensionCodec}; use prost::Message; use pyo3::{exceptions::PyRuntimeError, prelude::*, types::PyBytes}; @@ -82,18 +103,54 @@ impl PyLogicalPlan { LogicalPlan::SubqueryAlias(plan) => PySubqueryAlias::from(plan.clone()).to_variant(py), LogicalPlan::Unnest(plan) => PyUnnest::from(plan.clone()).to_variant(py), LogicalPlan::Window(plan) => PyWindowExpr::from(plan.clone()).to_variant(py), - LogicalPlan::Repartition(_) - | LogicalPlan::Union(_) - | 
LogicalPlan::Statement(_) - | LogicalPlan::Values(_) - | LogicalPlan::Dml(_) - | LogicalPlan::Ddl(_) - | LogicalPlan::Copy(_) - | LogicalPlan::DescribeTable(_) - | LogicalPlan::RecursiveQuery(_) => Err(py_unsupported_variant_err(format!( - "Conversion of variant not implemented: {:?}", - self.plan - ))), + LogicalPlan::Repartition(plan) => PyRepartition::from(plan.clone()).to_variant(py), + LogicalPlan::Union(plan) => PyUnion::from(plan.clone()).to_variant(py), + LogicalPlan::Statement(plan) => match plan { + Statement::TransactionStart(plan) => { + PyTransactionStart::from(plan.clone()).to_variant(py) + } + Statement::TransactionEnd(plan) => { + PyTransactionEnd::from(plan.clone()).to_variant(py) + } + Statement::SetVariable(plan) => PySetVariable::from(plan.clone()).to_variant(py), + Statement::Prepare(plan) => PyPrepare::from(plan.clone()).to_variant(py), + Statement::Execute(plan) => PyExecute::from(plan.clone()).to_variant(py), + Statement::Deallocate(plan) => PyDeallocate::from(plan.clone()).to_variant(py), + }, + LogicalPlan::Values(plan) => PyValues::from(plan.clone()).to_variant(py), + LogicalPlan::Dml(plan) => PyDmlStatement::from(plan.clone()).to_variant(py), + LogicalPlan::Ddl(plan) => match plan { + DdlStatement::CreateExternalTable(plan) => { + PyCreateExternalTable::from(plan.clone()).to_variant(py) + } + DdlStatement::CreateMemoryTable(plan) => { + PyCreateMemoryTable::from(plan.clone()).to_variant(py) + } + DdlStatement::CreateView(plan) => PyCreateView::from(plan.clone()).to_variant(py), + DdlStatement::CreateCatalogSchema(plan) => { + PyCreateCatalogSchema::from(plan.clone()).to_variant(py) + } + DdlStatement::CreateCatalog(plan) => { + PyCreateCatalog::from(plan.clone()).to_variant(py) + } + DdlStatement::CreateIndex(plan) => PyCreateIndex::from(plan.clone()).to_variant(py), + DdlStatement::DropTable(plan) => PyDropTable::from(plan.clone()).to_variant(py), + DdlStatement::DropView(plan) => PyDropView::from(plan.clone()).to_variant(py), + 
DdlStatement::DropCatalogSchema(plan) => { + PyDropCatalogSchema::from(plan.clone()).to_variant(py) + } + DdlStatement::CreateFunction(plan) => { + PyCreateFunction::from(plan.clone()).to_variant(py) + } + DdlStatement::DropFunction(plan) => { + PyDropFunction::from(plan.clone()).to_variant(py) + } + }, + LogicalPlan::Copy(plan) => PyCopyTo::from(plan.clone()).to_variant(py), + LogicalPlan::DescribeTable(plan) => PyDescribeTable::from(plan.clone()).to_variant(py), + LogicalPlan::RecursiveQuery(plan) => { + PyRecursiveQuery::from(plan.clone()).to_variant(py) + } } } From 7d8bcd8d20623beb76a397eb4fddfb18781589eb Mon Sep 17 00:00:00 2001 From: kosiew Date: Mon, 5 May 2025 21:50:52 +0800 Subject: [PATCH 130/248] Partial fix for 1078: Enhance DataFrame Formatter Configuration with Memory and Display Controls (#1119) * feat: add configurable max table bytes and min table rows for DataFrame display * Revert "feat: add configurable max table bytes and min table rows for DataFrame display" This reverts commit f9b78fa3180c5d6c20eaa3b6d0af7426d7084093. * feat: add FormatterConfig for configurable DataFrame display options * refactor: simplify attribute extraction in get_formatter_config function * refactor: remove hardcoded constants and use FormatterConfig for display options * refactor: simplify record batch collection by using FormatterConfig for display options * feat: add max_memory_bytes, min_rows_display, and repr_rows parameters to DataFrameHtmlFormatter * feat: add tests for HTML formatter row display settings and memory limit * refactor: extract Python formatter retrieval into a separate function * Revert "feat: add tests for HTML formatter row display settings and memory limit" This reverts commit e089d7b282e53e587116b11d92760e6d292ec871. * feat: add tests for HTML formatter row and memory limit configurations * Revert "feat: add tests for HTML formatter row and memory limit configurations" This reverts commit 4090fd2f7378855b045d6bfd1368d088cc9ada75. 
* feat: add tests for new parameters and validation in DataFrameHtmlFormatter * Reorganize tests * refactor: rename and restructure formatter functions for clarity and maintainability * feat: implement PythonFormatter struct and refactor formatter retrieval for improved clarity * refactor: improve comments and restructure FormatterConfig usage in PyDataFrame * Add DataFrame usage guide with HTML rendering customization options (#1108) * docs: enhance user guide with detailed DataFrame operations and examples * move /docs/source/api/dataframe.rst into user-guide * docs: remove DataFrame API documentation * docs: fix formatting inconsistencies in DataFrame user guide * Two minor corrections to documentation rendering --------- Co-authored-by: Tim Saucer * Update documentation * refactor: streamline HTML rendering documentation * refactor: extract validation logic into separate functions for clarity * Implement feature X to enhance user experience and optimize performance * feat: add validation method for FormatterConfig to ensure positive integer values * add comment - ensure minimum rows are collected even if memory or row limits are hit * Update html_formatter documentation * update tests * remove unused type hints from imports in html_formatter.py * remove redundant tests for DataFrameHtmlFormatter and clean up assertions * refactor get_attr function to support generic default values * build_formatter_config_from_python return PyResult * fix ruff errors * trigger ci * fix: remove redundant newline in test_custom_style_provider_html_formatter * add more tests * trigger ci * Fix ruff errors * fix clippy error * feat: add validation for parameters in configure_formatter * test: add tests for invalid parameters in configure_formatter * Fix ruff errors --------- Co-authored-by: Tim Saucer --- docs/source/user-guide/dataframe.rst | 52 +++++++- python/datafusion/html_formatter.py | 104 ++++++++++++--- python/tests/test_dataframe.py | 183 ++++++++++++++++++++++++--- 
src/dataframe.rs | 142 +++++++++++++++++---- 4 files changed, 413 insertions(+), 68 deletions(-) diff --git a/docs/source/user-guide/dataframe.rst b/docs/source/user-guide/dataframe.rst index a78fd8073..11e3d7e72 100644 --- a/docs/source/user-guide/dataframe.rst +++ b/docs/source/user-guide/dataframe.rst @@ -75,13 +75,17 @@ You can customize how DataFrames are rendered in HTML by configuring the formatt # Change the default styling configure_formatter( - max_rows=50, # Maximum number of rows to display - max_width=None, # Maximum width in pixels (None for auto) - theme="light", # Theme: "light" or "dark" - precision=2, # Floating point precision - thousands_separator=",", # Separator for thousands - date_format="%Y-%m-%d", # Date format - truncate_width=20 # Max width for string columns before truncating + max_cell_length=25, # Maximum characters in a cell before truncation + max_width=1000, # Maximum width in pixels + max_height=300, # Maximum height in pixels + max_memory_bytes=2097152, # Maximum memory for rendering (2MB) + min_rows_display=20, # Minimum number of rows to display + repr_rows=10, # Number of rows to display in __repr__ + enable_cell_expansion=True,# Allow expanding truncated cells + custom_css=None, # Additional custom CSS + show_truncation_message=True, # Show message when data is truncated + style_provider=None, # Custom styling provider + use_shared_styles=True # Share styles across tables ) The formatter settings affect all DataFrames displayed after configuration. @@ -113,6 +117,25 @@ For advanced styling needs, you can create a custom style provider: # Apply the custom style provider configure_formatter(style_provider=MyStyleProvider()) +Performance Optimization with Shared Styles +------------------------------------------- +The ``use_shared_styles`` parameter (enabled by default) optimizes performance when displaying +multiple DataFrames in notebook environments: + + .. 
code-block:: python + from datafusion.html_formatter import StyleProvider, configure_formatter + # Default: Use shared styles (recommended for notebooks) + configure_formatter(use_shared_styles=True) + + # Disable shared styles (each DataFrame includes its own styles) + configure_formatter(use_shared_styles=False) + +When ``use_shared_styles=True``: +- CSS styles and JavaScript are included only once per notebook session +- This reduces HTML output size and prevents style duplication +- Improves rendering performance with many DataFrames +- Applies consistent styling across all DataFrames + Creating a Custom Formatter --------------------------- @@ -177,3 +200,18 @@ You can also use a context manager to temporarily change formatting settings: # Back to default formatting df.show() + +Memory and Display Controls +--------------------------- + +You can control how much data is displayed and how much memory is used for rendering: + + .. code-block:: python + + configure_formatter( + max_memory_bytes=4 * 1024 * 1024, # 4MB maximum memory for display + min_rows_display=50, # Always show at least 50 rows + repr_rows=20 # Show 20 rows in __repr__ output + ) + +These parameters help balance comprehensive data display against performance considerations. \ No newline at end of file diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py index a50e14fd5..12a7e4553 100644 --- a/python/datafusion/html_formatter.py +++ b/python/datafusion/html_formatter.py @@ -27,6 +27,36 @@ ) +def _validate_positive_int(value: Any, param_name: str) -> None: + """Validate that a parameter is a positive integer. 
+ + Args: + value: The value to validate + param_name: Name of the parameter (used in error message) + + Raises: + ValueError: If the value is not a positive integer + """ + if not isinstance(value, int) or value <= 0: + msg = f"{param_name} must be a positive integer" + raise ValueError(msg) + + +def _validate_bool(value: Any, param_name: str) -> None: + """Validate that a parameter is a boolean. + + Args: + value: The value to validate + param_name: Name of the parameter (used in error message) + + Raises: + TypeError: If the value is not a boolean + """ + if not isinstance(value, bool): + msg = f"{param_name} must be a boolean" + raise TypeError(msg) + + @runtime_checkable class CellFormatter(Protocol): """Protocol for cell value formatters.""" @@ -91,6 +121,9 @@ class DataFrameHtmlFormatter: max_cell_length: Maximum characters to display in a cell before truncation max_width: Maximum width of the HTML table in pixels max_height: Maximum height of the HTML table in pixels + max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB) + min_rows_display: Minimum number of rows to display + repr_rows: Default number of rows to display in repr output enable_cell_expansion: Whether to add expand/collapse buttons for long cell values custom_css: Additional CSS to include in the HTML output @@ -108,6 +141,9 @@ def __init__( max_cell_length: int = 25, max_width: int = 1000, max_height: int = 300, + max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB + min_rows_display: int = 20, + repr_rows: int = 10, enable_cell_expansion: bool = True, custom_css: Optional[str] = None, show_truncation_message: bool = True, @@ -124,6 +160,12 @@ def __init__( Maximum width of the displayed table in pixels. max_height : int, default 300 Maximum height of the displayed table in pixels. + max_memory_bytes : int, default 2097152 (2MB) + Maximum memory in bytes for rendered data. + min_rows_display : int, default 20 + Minimum number of rows to display. 
+ repr_rows : int, default 10 + Default number of rows to display in repr output. enable_cell_expansion : bool, default True Whether to allow cells to expand when clicked. custom_css : str, optional @@ -139,7 +181,8 @@ def __init__( Raises: ------ ValueError - If max_cell_length, max_width, or max_height is not a positive integer. + If max_cell_length, max_width, max_height, max_memory_bytes, + min_rows_display, or repr_rows is not a positive integer. TypeError If enable_cell_expansion, show_truncation_message, or use_shared_styles is not a boolean, @@ -148,27 +191,17 @@ def __init__( protocol. """ # Validate numeric parameters - - if not isinstance(max_cell_length, int) or max_cell_length <= 0: - msg = "max_cell_length must be a positive integer" - raise ValueError(msg) - if not isinstance(max_width, int) or max_width <= 0: - msg = "max_width must be a positive integer" - raise ValueError(msg) - if not isinstance(max_height, int) or max_height <= 0: - msg = "max_height must be a positive integer" - raise ValueError(msg) + _validate_positive_int(max_cell_length, "max_cell_length") + _validate_positive_int(max_width, "max_width") + _validate_positive_int(max_height, "max_height") + _validate_positive_int(max_memory_bytes, "max_memory_bytes") + _validate_positive_int(min_rows_display, "min_rows_display") + _validate_positive_int(repr_rows, "repr_rows") # Validate boolean parameters - if not isinstance(enable_cell_expansion, bool): - msg = "enable_cell_expansion must be a boolean" - raise TypeError(msg) - if not isinstance(show_truncation_message, bool): - msg = "show_truncation_message must be a boolean" - raise TypeError(msg) - if not isinstance(use_shared_styles, bool): - msg = "use_shared_styles must be a boolean" - raise TypeError(msg) + _validate_bool(enable_cell_expansion, "enable_cell_expansion") + _validate_bool(show_truncation_message, "show_truncation_message") + _validate_bool(use_shared_styles, "use_shared_styles") # Validate custom_css if custom_css is 
not None and not isinstance(custom_css, str): @@ -183,6 +216,9 @@ def __init__( self.max_cell_length = max_cell_length self.max_width = max_width self.max_height = max_height + self.max_memory_bytes = max_memory_bytes + self.min_rows_display = min_rows_display + self.repr_rows = repr_rows self.enable_cell_expansion = enable_cell_expansion self.custom_css = custom_css self.show_truncation_message = show_truncation_message @@ -597,6 +633,9 @@ def configure_formatter(**kwargs: Any) -> None: **kwargs: Formatter configuration parameters like max_cell_length, max_width, max_height, enable_cell_expansion, etc. + Raises: + ValueError: If any invalid parameters are provided + Example: >>> from datafusion.html_formatter import configure_formatter >>> configure_formatter( @@ -606,6 +645,31 @@ def configure_formatter(**kwargs: Any) -> None: ... use_shared_styles=True ... ) """ + # Valid parameters accepted by DataFrameHtmlFormatter + valid_params = { + "max_cell_length", + "max_width", + "max_height", + "max_memory_bytes", + "min_rows_display", + "repr_rows", + "enable_cell_expansion", + "custom_css", + "show_truncation_message", + "style_provider", + "use_shared_styles", + } + + # Check for invalid parameters + invalid_params = set(kwargs) - valid_params + if invalid_params: + msg = ( + f"Invalid formatter parameters: {', '.join(invalid_params)}. 
" + f"Valid parameters are: {', '.join(valid_params)}" + ) + raise ValueError(msg) + + # Create and set formatter with validated parameters set_formatter(DataFrameHtmlFormatter(**kwargs)) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 464b884db..e01308c86 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -41,6 +41,8 @@ ) from pyarrow.csv import write_csv +MB = 1024 * 1024 + @pytest.fixture def ctx(): @@ -117,6 +119,31 @@ def clean_formatter_state(): reset_formatter() +# custom style for testing with html formatter +class CustomStyleProvider: + def get_cell_style(self) -> str: + return ( + "background-color: #f5f5f5; color: #333; padding: 8px; border: " + "1px solid #ddd;" + ) + + def get_header_style(self) -> str: + return ( + "background-color: #4285f4; color: white; font-weight: bold; " + "padding: 10px; border: 1px solid #3367d6;" + ) + + +def count_table_rows(html_content: str) -> int: + """Count the number of table rows in HTML content. 
+ Args: + html_content: HTML string to analyze + Returns: + Number of table rows found (number of tags) + """ + return len(re.findall(r" str: - return ( - "background-color: #f5f5f5; color: #333; padding: 8px; border: " - "1px solid #ddd;" - ) - - def get_header_style(self) -> str: - return ( - "background-color: #4285f4; color: white; font-weight: bold; " - "padding: 10px; border: 1px solid #3367d6;" - ) - # Configure with custom style provider configure_formatter(style_provider=CustomStyleProvider()) @@ -917,6 +930,141 @@ def get_header_style(self) -> str: assert "color: #5af" in html_output # Even numbers +def test_html_formatter_memory(df, clean_formatter_state): + """Test the memory and row control parameters in DataFrameHtmlFormatter.""" + configure_formatter(max_memory_bytes=10, min_rows_display=1) + html_output = df._repr_html_() + + # Count the number of table rows in the output + tr_count = count_table_rows(html_output) + # With a tiny memory limit of 10 bytes, the formatter should display + # the minimum number of rows (1) plus a message about truncation + assert tr_count == 2 # 1 for header row, 1 for data row + assert "data truncated" in html_output.lower() + + configure_formatter(max_memory_bytes=10 * MB, min_rows_display=1) + html_output = df._repr_html_() + # With larger memory limit and min_rows=2, should display all rows + tr_count = count_table_rows(html_output) + # Table should have header row (1) + 3 data rows = 4 rows + assert tr_count == 4 + # No truncation message should appear + assert "data truncated" not in html_output.lower() + + +def test_html_formatter_repr_rows(df, clean_formatter_state): + configure_formatter(min_rows_display=2, repr_rows=2) + html_output = df._repr_html_() + + tr_count = count_table_rows(html_output) + # Tabe should have header row (1) + 2 data rows = 3 rows + assert tr_count == 3 + + configure_formatter(min_rows_display=2, repr_rows=3) + html_output = df._repr_html_() + + tr_count = count_table_rows(html_output) + 
# Tabe should have header row (1) + 3 data rows = 4 rows + assert tr_count == 4 + + +def test_html_formatter_validation(): + # Test validation for invalid parameters + + with pytest.raises(ValueError, match="max_cell_length must be a positive integer"): + DataFrameHtmlFormatter(max_cell_length=0) + + with pytest.raises(ValueError, match="max_width must be a positive integer"): + DataFrameHtmlFormatter(max_width=0) + + with pytest.raises(ValueError, match="max_height must be a positive integer"): + DataFrameHtmlFormatter(max_height=0) + + with pytest.raises(ValueError, match="max_memory_bytes must be a positive integer"): + DataFrameHtmlFormatter(max_memory_bytes=0) + + with pytest.raises(ValueError, match="max_memory_bytes must be a positive integer"): + DataFrameHtmlFormatter(max_memory_bytes=-100) + + with pytest.raises(ValueError, match="min_rows_display must be a positive integer"): + DataFrameHtmlFormatter(min_rows_display=0) + + with pytest.raises(ValueError, match="min_rows_display must be a positive integer"): + DataFrameHtmlFormatter(min_rows_display=-5) + + with pytest.raises(ValueError, match="repr_rows must be a positive integer"): + DataFrameHtmlFormatter(repr_rows=0) + + with pytest.raises(ValueError, match="repr_rows must be a positive integer"): + DataFrameHtmlFormatter(repr_rows=-10) + + +def test_configure_formatter(df, clean_formatter_state): + """Test using custom style providers with the HTML formatter and configured + parameters.""" + + # these are non-default values + max_cell_length = 10 + max_width = 500 + max_height = 30 + max_memory_bytes = 3 * MB + min_rows_display = 2 + repr_rows = 2 + enable_cell_expansion = False + show_truncation_message = False + use_shared_styles = False + + reset_formatter() + formatter_default = get_formatter() + + assert formatter_default.max_cell_length != max_cell_length + assert formatter_default.max_width != max_width + assert formatter_default.max_height != max_height + assert 
formatter_default.max_memory_bytes != max_memory_bytes + assert formatter_default.min_rows_display != min_rows_display + assert formatter_default.repr_rows != repr_rows + assert formatter_default.enable_cell_expansion != enable_cell_expansion + assert formatter_default.show_truncation_message != show_truncation_message + assert formatter_default.use_shared_styles != use_shared_styles + + # Configure with custom style provider and additional parameters + configure_formatter( + max_cell_length=max_cell_length, + max_width=max_width, + max_height=max_height, + max_memory_bytes=max_memory_bytes, + min_rows_display=min_rows_display, + repr_rows=repr_rows, + enable_cell_expansion=enable_cell_expansion, + show_truncation_message=show_truncation_message, + use_shared_styles=use_shared_styles, + ) + formatter_custom = get_formatter() + assert formatter_custom.max_cell_length == max_cell_length + assert formatter_custom.max_width == max_width + assert formatter_custom.max_height == max_height + assert formatter_custom.max_memory_bytes == max_memory_bytes + assert formatter_custom.min_rows_display == min_rows_display + assert formatter_custom.repr_rows == repr_rows + assert formatter_custom.enable_cell_expansion == enable_cell_expansion + assert formatter_custom.show_truncation_message == show_truncation_message + assert formatter_custom.use_shared_styles == use_shared_styles + + +def test_configure_formatter_invalid_params(clean_formatter_state): + """Test that configure_formatter rejects invalid parameters.""" + with pytest.raises(ValueError, match="Invalid formatter parameters"): + configure_formatter(invalid_param=123) + + # Test with multiple parameters, one valid and one invalid + with pytest.raises(ValueError, match="Invalid formatter parameters"): + configure_formatter(max_width=500, not_a_real_param="test") + + # Test with multiple invalid parameters + with pytest.raises(ValueError, match="Invalid formatter parameters"): + configure_formatter(fake_param1="test", 
fake_param2=456) + + def test_get_dataframe(tmp_path): ctx = SessionContext() @@ -1505,9 +1653,8 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame: assert result["new_col"] == [3 for _i in range(3)] -def test_dataframe_repr_html_structure(df) -> None: +def test_dataframe_repr_html_structure(df, clean_formatter_state) -> None: """Test that DataFrame._repr_html_ produces expected HTML output structure.""" - import re output = df._repr_html_() @@ -1537,7 +1684,7 @@ def test_dataframe_repr_html_structure(df) -> None: assert len(body_matches) == 1, "Expected pattern of values not found in HTML output" -def test_dataframe_repr_html_values(df): +def test_dataframe_repr_html_values(df, clean_formatter_state): """Test that DataFrame._repr_html_ contains the expected data values.""" html = df._repr_html_() assert html is not None diff --git a/src/dataframe.rs b/src/dataframe.rs index 787f63520..211e31bd1 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -71,8 +71,103 @@ impl PyTableProvider { PyTable::new(table_provider) } } -const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB -const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20; + +/// Configuration for DataFrame display formatting +#[derive(Debug, Clone)] +pub struct FormatterConfig { + /// Maximum memory in bytes to use for display (default: 2MB) + pub max_bytes: usize, + /// Minimum number of rows to display (default: 20) + pub min_rows: usize, + /// Number of rows to include in __repr__ output (default: 10) + pub repr_rows: usize, +} + +impl Default for FormatterConfig { + fn default() -> Self { + Self { + max_bytes: 2 * 1024 * 1024, // 2MB + min_rows: 20, + repr_rows: 10, + } + } +} + +impl FormatterConfig { + /// Validates that all configuration values are positive integers. + /// + /// # Returns + /// + /// `Ok(())` if all values are valid, or an `Err` with a descriptive error message. 
+ pub fn validate(&self) -> Result<(), String> { + if self.max_bytes == 0 { + return Err("max_bytes must be a positive integer".to_string()); + } + + if self.min_rows == 0 { + return Err("min_rows must be a positive integer".to_string()); + } + + if self.repr_rows == 0 { + return Err("repr_rows must be a positive integer".to_string()); + } + + Ok(()) + } +} + +/// Holds the Python formatter and its configuration +struct PythonFormatter<'py> { + /// The Python formatter object + formatter: Bound<'py, PyAny>, + /// The formatter configuration + config: FormatterConfig, +} + +/// Get the Python formatter and its configuration +fn get_python_formatter_with_config(py: Python) -> PyResult { + let formatter = import_python_formatter(py)?; + let config = build_formatter_config_from_python(&formatter)?; + Ok(PythonFormatter { formatter, config }) +} + +/// Get the Python formatter from the datafusion.html_formatter module +fn import_python_formatter(py: Python) -> PyResult> { + let formatter_module = py.import("datafusion.html_formatter")?; + let get_formatter = formatter_module.getattr("get_formatter")?; + get_formatter.call0() +} + +// Helper function to extract attributes with fallback to default +fn get_attr<'a, T>(py_object: &'a Bound<'a, PyAny>, attr_name: &str, default_value: T) -> T +where + T: for<'py> pyo3::FromPyObject<'py> + Clone, +{ + py_object + .getattr(attr_name) + .and_then(|v| v.extract::()) + .unwrap_or_else(|_| default_value.clone()) +} + +/// Helper function to create a FormatterConfig from a Python formatter object +fn build_formatter_config_from_python(formatter: &Bound<'_, PyAny>) -> PyResult { + let default_config = FormatterConfig::default(); + let max_bytes = get_attr(formatter, "max_memory_bytes", default_config.max_bytes); + let min_rows = get_attr(formatter, "min_rows_display", default_config.min_rows); + let repr_rows = get_attr(formatter, "repr_rows", default_config.repr_rows); + + let config = FormatterConfig { + max_bytes, + min_rows, + 
repr_rows, + }; + + // Return the validated config, converting String error to PyErr + config + .validate() + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e))?; + Ok(config) +} /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. @@ -114,9 +209,14 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult { + // Get the Python formatter config + let PythonFormatter { + formatter: _, + config, + } = get_python_formatter_with_config(py)?; let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10), + collect_record_batches_to_display(self.df.as_ref().clone(), config), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below @@ -135,13 +235,11 @@ impl PyDataFrame { } fn _repr_html_(&self, py: Python) -> PyDataFusionResult { + // Get the Python formatter and config + let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?; let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display( - self.df.as_ref().clone(), - MIN_TABLE_ROWS_TO_DISPLAY, - usize::MAX, - ), + collect_record_batches_to_display(self.df.as_ref().clone(), config), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below @@ -158,12 +256,6 @@ impl PyDataFrame { let py_schema = self.schema().into_pyobject(py)?; - // Get the Python formatter module and call format_html - let formatter_module = py.import("datafusion.html_formatter")?; - let get_formatter = formatter_module.getattr("get_formatter")?; - let formatter = get_formatter.call0()?; - - // Call format_html method on the formatter let kwargs = pyo3::types::PyDict::new(py); let py_batches_list = PyList::new(py, py_batches.as_slice())?; kwargs.set_item("batches", py_batches_list)?; @@ 
-796,9 +888,14 @@ fn record_batch_into_schema( /// rows, set min_rows == max_rows. async fn collect_record_batches_to_display( df: DataFrame, - min_rows: usize, - max_rows: usize, + config: FormatterConfig, ) -> Result<(Vec, bool), DataFusionError> { + let FormatterConfig { + max_bytes, + min_rows, + repr_rows, + } = config; + let partitioned_stream = df.execute_stream_partitioned().await?; let mut stream = futures::stream::iter(partitioned_stream).flatten(); let mut size_estimate_so_far = 0; @@ -806,9 +903,8 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows) - || rows_so_far < min_rows - { + // ensure minimum rows even if memory/row limits are hit + while (size_estimate_so_far < max_bytes && rows_so_far < repr_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { None => { break; @@ -821,8 +917,8 @@ async fn collect_record_batches_to_display( if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); - if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY { - let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32; + if size_estimate_so_far > max_bytes { + let ratio = max_bytes as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; @@ -838,8 +934,8 @@ async fn collect_record_batches_to_display( } } - if rows_in_rb + rows_so_far > max_rows { - rb = rb.slice(0, max_rows - rows_so_far); + if rows_in_rb + rows_so_far > repr_rows { + rb = rb.slice(0, repr_rows - rows_so_far); has_more = true; } From f3c98ec7a2eb325041530b1ae8d6de41aa558037 Mon Sep 17 00:00:00 2001 From: kosiew Date: Fri, 16 May 2025 14:34:19 +0800 Subject: [PATCH 131/248] Add fill_null method to DataFrame API for handling missing values (#1019) * feat: add fill_null method to DataFrame for handling null 
values * test: add coalesce function tests for handling default values * Resolve test cases for fill_null * feat: add fill_nan method to DataFrame for handling NaN values * move imports out of functions * docs: add documentation for fill_null and fill_nan methods in DataFrame * Add more tests * fix ruff errors * amend def fill_null to invoke PyDataFrame's fill_null - Implemented `fill_null` method in `dataframe.rs` to allow filling null values with a specified value for specific columns or all columns. - Added a helper function `python_value_to_scalar_value` to convert Python values to DataFusion ScalarValues, supporting various types including integers, floats, booleans, strings, and timestamps. - Updated the `count` method in `PyDataFrame` to maintain functionality. * refactor: remove fill_nan method documentation from functions.rst * refactor: remove unused import of Enum from dataframe.py * refactor: improve error handling and type extraction in python_value_to_scalar_value function * refactor: enhance datetime and date conversion logic in python_value_to_scalar_value function * refactor: streamline type extraction in python_value_to_scalar_value function * fix try_convert_to_string * refactor: improve type handling in python_value_to_scalar_value function * refactor: move py_obj_to_scalar_value function to utils module * refactor: update fill_null to use py_obj_to_scalar_value from utils * Remove python_object_to_scalar_value code * refactor: enhance py_obj_to_scalar_value to utilize PyArrow for complex type conversion * refactor: update py_obj_to_scalar_value to handle errors and use extract_bound for PyArrow scalar conversion * refactor: modify py_obj_to_scalar_value to return ScalarValue directly and streamline error handling * refactor: update py_obj_to_scalar_value to return a Result for better error handling * test: add tests for fill_null functionality in DataFrame with null values * test: enhance null DataFrame tests to include date32 and date64 
columns * refactor: simplify py_obj_to_scalar_value by removing direct extraction of basic types * refactor: remove unnecessary documentation from py_obj_to_scalar_value function * Fix ruff errors * test: update datetime handling in coalesce tests to include timezone information * Fix ruff errors * trigger ci --- .../common-operations/functions.rst | 21 ++ python/datafusion/dataframe.py | 26 +- python/tests/test_dataframe.py | 266 ++++++++++++++++++ python/tests/test_functions.py | 61 ++++ src/config.rs | 21 +- src/dataframe.rs | 23 +- src/utils.rs | 18 ++ 7 files changed, 414 insertions(+), 22 deletions(-) diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst index 12097be8f..d458d3eb0 100644 --- a/docs/source/user-guide/common-operations/functions.rst +++ b/docs/source/user-guide/common-operations/functions.rst @@ -129,3 +129,24 @@ The function :py:func:`~datafusion.functions.in_list` allows to check a column f .limit(20) .to_pandas() ) + + +Handling Missing Values +===================== + +DataFusion provides methods to handle missing values in DataFrames: + +fill_null +--------- + +The ``fill_null()`` method replaces NULL values in specified columns with a provided value: + +.. code-block:: python + + # Fill all NULL values with 0 where possible + df = df.fill_null(0) + + # Fill NULL values only in specific string columns + df = df.fill_null("missing", subset=["name", "category"]) + +The fill value will be cast to match each column's type. If casting fails for a column, that column remains unchanged. 
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 26fe8f453..a1df7e080 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -37,6 +37,8 @@ except ImportError: from typing_extensions import deprecated # Python 3.12 +from datafusion._internal import DataFrame as DataFrameInternal +from datafusion.expr import Expr, SortExpr, sort_or_default from datafusion.plan import ExecutionPlan, LogicalPlan from datafusion.record_batch import RecordBatchStream @@ -53,8 +55,6 @@ from enum import Enum -from datafusion.expr import Expr, SortExpr, sort_or_default - # excerpt from deltalake # https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163 @@ -869,3 +869,25 @@ def within_limit(df: DataFrame, limit: int) -> DataFrame: DataFrame: After applying func to the original dataframe. """ return func(self, *args) + + def fill_null(self, value: Any, subset: list[str] | None = None) -> DataFrame: + """Fill null values in specified columns with a value. + + Args: + value: Value to replace nulls with. Will be cast to match column type. + subset: Optional list of column names to fill. If None, fills all columns. + + Returns: + DataFrame with null values replaced where type casting is possible + + Examples: + >>> df = df.fill_null(0) # Fill all nulls with 0 where possible + >>> # Fill nulls in specific string columns + >>> df = df.fill_null("missing", subset=["name", "category"]) + + Notes: + - Only fills nulls in columns where the value can be cast to the column type + - For columns where casting fails, the original column is kept unchanged + - For columns not in subset, the original column is kept unchanged + """ + return DataFrame(self.df.fill_null(value, subset)) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index e01308c86..dd5f962b2 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -14,6 +14,7 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. +import datetime import os import re from typing import Any @@ -119,6 +120,38 @@ def clean_formatter_state(): reset_formatter() +@pytest.fixture +def null_df(): + """Create a DataFrame with null values of different types.""" + ctx = SessionContext() + + # Create a RecordBatch with nulls across different types + batch = pa.RecordBatch.from_arrays( + [ + pa.array([1, None, 3, None], type=pa.int64()), + pa.array([4.5, 6.7, None, None], type=pa.float64()), + pa.array(["a", None, "c", None], type=pa.string()), + pa.array([True, None, False, None], type=pa.bool_()), + pa.array( + [10957, None, 18993, None], type=pa.date32() + ), # 2000-01-01, null, 2022-01-01, null + pa.array( + [946684800000, None, 1640995200000, None], type=pa.date64() + ), # 2000-01-01, null, 2022-01-01, null + ], + names=[ + "int_col", + "float_col", + "str_col", + "bool_col", + "date32_col", + "date64_col", + ], + ) + + return ctx.create_dataframe([[batch]]) + + # custom style for testing with html formatter class CustomStyleProvider: def get_cell_style(self) -> str: @@ -1794,3 +1827,236 @@ def test_html_formatter_manual_format_html(clean_formatter_state): assert "") + return html + + def _build_table_container_start(self) -> list[str]: + """Build the opening tags for the table container.""" + html = [] + html.append( + f'
' + ) + html.append('') + return html + + def _build_table_header(self, schema: Any) -> list[str]: + """Build the HTML table header with column names.""" + html = [] + html.append("") + html.append("") + for field in schema: + if self._custom_header_builder: + html.append(self._custom_header_builder(field)) + else: + html.append( + f"" + ) + html.append("") + html.append("") + return html + + def _build_table_body(self, batches: list, table_uuid: str) -> list[str]: + """Build the HTML table body with data rows.""" + html = [] + html.append("") + + row_count = 0 + for batch in batches: + for row_idx in range(batch.num_rows): + row_count += 1 + html.append("") + + for col_idx, column in enumerate(batch.columns): + # Get the raw value from the column + raw_value = self._get_cell_value(column, row_idx) + + # Always check for type formatters first to format the value + formatted_value = self._format_cell_value(raw_value) + + # Then apply either custom cell builder or standard cell formatting + if self._custom_cell_builder: + # Pass both the raw value and formatted value to let the + # builder decide + cell_html = self._custom_cell_builder( + raw_value, row_count, col_idx, table_uuid + ) + html.append(cell_html) + else: + # Standard cell formatting with formatted value + if ( + len(str(raw_value)) > self.max_cell_length + and self.enable_cell_expansion + ): + cell_html = self._build_expandable_cell( + formatted_value, row_count, col_idx, table_uuid + ) + else: + cell_html = self._build_regular_cell(formatted_value) + html.append(cell_html) + + html.append("") + + html.append("") + return html + + def _get_cell_value(self, column: Any, row_idx: int) -> Any: + """Extract a cell value from a column. 
+ + Args: + column: Arrow array + row_idx: Row index + + Returns: + The raw cell value + """ + try: + value = column[row_idx] + + if hasattr(value, "as_py"): + return value.as_py() + except (AttributeError, TypeError): + pass + else: + return value + + def _format_cell_value(self, value: Any) -> str: + """Format a cell value for display. + + Uses registered type formatters if available. + + Args: + value: The cell value to format + + Returns: + Formatted cell value as string + """ + # Check for custom type formatters + for type_cls, formatter in self._type_formatters.items(): + if isinstance(value, type_cls): + return formatter(value) + + # If no formatter matched, return string representation + return str(value) + + def _build_expandable_cell( + self, formatted_value: str, row_count: int, col_idx: int, table_uuid: str + ) -> str: + """Build an expandable cell for long content.""" + short_value = str(formatted_value)[: self.max_cell_length] + return ( + f"" + ) + + def _build_regular_cell(self, formatted_value: str) -> str: + """Build a regular table cell.""" + return ( + f"" + ) + + def _build_html_footer(self, has_more: bool) -> list[str]: + """Build the HTML footer with JavaScript and messages.""" + html = [] + + # Add JavaScript for interactivity only if cell expansion is enabled + # and we're not using the shared styles approach + if self.enable_cell_expansion and not self.use_shared_styles: + html.append(self._get_javascript()) + + # Add truncation message if needed + if has_more and self.show_truncation_message: + html.append("
Data truncated due to size.
") + + return html + + def _get_default_css(self) -> str: + """Get default CSS styles for the HTML table.""" + return """ + .expandable-container { + display: inline-block; + max-width: 200px; + } + .expandable { + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + display: block; + } + .full-text { + display: none; + white-space: normal; + } + .expand-btn { + cursor: pointer; + color: blue; + text-decoration: underline; + border: none; + background: none; + font-size: inherit; + display: block; + margin-top: 5px; + } + """ + + def _get_javascript(self) -> str: + """Get JavaScript code for interactive elements.""" + return """ + + """ + + +class FormatterManager: + """Manager class for the global DataFrame HTML formatter instance.""" + + _default_formatter: DataFrameHtmlFormatter = DataFrameHtmlFormatter() + + @classmethod + def set_formatter(cls, formatter: DataFrameHtmlFormatter) -> None: + """Set the global DataFrame HTML formatter. + + Args: + formatter: The formatter instance to use globally + """ + cls._default_formatter = formatter + _refresh_formatter_reference() + + @classmethod + def get_formatter(cls) -> DataFrameHtmlFormatter: + """Get the current global DataFrame HTML formatter. + + Returns: + The global HTML formatter instance + """ + return cls._default_formatter + + +def get_formatter() -> DataFrameHtmlFormatter: + """Get the current global DataFrame HTML formatter. + + This function is used by the DataFrame._repr_html_ implementation to access + the shared formatter instance. It can also be used directly when custom + HTML rendering is needed. + + Returns: + The global HTML formatter instance + + Example: + >>> from datafusion.html_formatter import get_formatter + >>> formatter = get_formatter() + >>> formatter.max_cell_length = 50 # Increase cell length + """ + return FormatterManager.get_formatter() + + +def set_formatter(formatter: DataFrameHtmlFormatter) -> None: + """Set the global DataFrame HTML formatter. 
+ + Args: + formatter: The formatter instance to use globally + + Example: + >>> from datafusion.html_formatter import get_formatter, set_formatter + >>> custom_formatter = DataFrameHtmlFormatter(max_cell_length=100) + >>> set_formatter(custom_formatter) + """ + FormatterManager.set_formatter(formatter) + + +def configure_formatter(**kwargs: Any) -> None: + """Configure the global DataFrame HTML formatter. + + This function creates a new formatter with the provided configuration + and sets it as the global formatter for all DataFrames. + + Args: + **kwargs: Formatter configuration parameters like max_cell_length, + max_width, max_height, enable_cell_expansion, etc. + + Raises: + ValueError: If any invalid parameters are provided + + Example: + >>> from datafusion.html_formatter import configure_formatter + >>> configure_formatter( + ... max_cell_length=50, + ... max_height=500, + ... enable_cell_expansion=True, + ... use_shared_styles=True + ... ) + """ + # Valid parameters accepted by DataFrameHtmlFormatter + valid_params = { + "max_cell_length", + "max_width", + "max_height", + "max_memory_bytes", + "min_rows_display", + "repr_rows", + "enable_cell_expansion", + "custom_css", + "show_truncation_message", + "style_provider", + "use_shared_styles", + } + + # Check for invalid parameters + invalid_params = set(kwargs) - valid_params + if invalid_params: + msg = ( + f"Invalid formatter parameters: {', '.join(invalid_params)}. " + f"Valid parameters are: {', '.join(valid_params)}" + ) + raise ValueError(msg) + + # Create and set formatter with validated parameters + set_formatter(DataFrameHtmlFormatter(**kwargs)) + + +def reset_formatter() -> None: + """Reset the global DataFrame HTML formatter to default settings. + + This function creates a new formatter with default configuration + and sets it as the global formatter for all DataFrames. 
+ + Example: + >>> from datafusion.html_formatter import reset_formatter + >>> reset_formatter() # Reset formatter to default settings + """ + formatter = DataFrameHtmlFormatter() + # Reset the styles_loaded flag to ensure styles will be reloaded + DataFrameHtmlFormatter._styles_loaded = False + set_formatter(formatter) + + +def reset_styles_loaded_state() -> None: + """Reset the styles loaded state to force reloading of styles. + + This can be useful when switching between notebook sessions or + when styles need to be refreshed. + + Example: + >>> from datafusion.html_formatter import reset_styles_loaded_state + >>> reset_styles_loaded_state() # Force styles to reload in next render + """ + DataFrameHtmlFormatter._styles_loaded = False + + +def _refresh_formatter_reference() -> None: + """Refresh formatter reference in any modules using it. + + This helps ensure that changes to the formatter are reflected in existing + DataFrames that might be caching the formatter reference. + """ + # This is a no-op but signals modules to refresh their reference diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py index 12a7e4553..65eb1f042 100644 --- a/python/datafusion/html_formatter.py +++ b/python/datafusion/html_formatter.py @@ -14,698 +14,16 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""HTML formatting utilities for DataFusion DataFrames.""" -from __future__ import annotations +"""Deprecated module for dataframe formatting.""" -from typing import ( - Any, - Callable, - Optional, - Protocol, - runtime_checkable, -) - - -def _validate_positive_int(value: Any, param_name: str) -> None: - """Validate that a parameter is a positive integer. 
- - Args: - value: The value to validate - param_name: Name of the parameter (used in error message) - - Raises: - ValueError: If the value is not a positive integer - """ - if not isinstance(value, int) or value <= 0: - msg = f"{param_name} must be a positive integer" - raise ValueError(msg) - - -def _validate_bool(value: Any, param_name: str) -> None: - """Validate that a parameter is a boolean. - - Args: - value: The value to validate - param_name: Name of the parameter (used in error message) - - Raises: - TypeError: If the value is not a boolean - """ - if not isinstance(value, bool): - msg = f"{param_name} must be a boolean" - raise TypeError(msg) - - -@runtime_checkable -class CellFormatter(Protocol): - """Protocol for cell value formatters.""" - - def __call__(self, value: Any) -> str: - """Format a cell value to string representation.""" - ... - - -@runtime_checkable -class StyleProvider(Protocol): - """Protocol for HTML style providers.""" - - def get_cell_style(self) -> str: - """Get the CSS style for table cells.""" - ... - - def get_header_style(self) -> str: - """Get the CSS style for header cells.""" - ... - - -class DefaultStyleProvider: - """Default implementation of StyleProvider.""" - - def get_cell_style(self) -> str: - """Get the CSS style for table cells. - - Returns: - CSS style string - """ - return ( - "border: 1px solid black; padding: 8px; text-align: left; " - "white-space: nowrap;" - ) - - def get_header_style(self) -> str: - """Get the CSS style for header cells. - - Returns: - CSS style string - """ - return ( - "border: 1px solid black; padding: 8px; text-align: left; " - "background-color: #f2f2f2; white-space: nowrap; min-width: fit-content; " - "max-width: fit-content;" - ) - - -class DataFrameHtmlFormatter: - """Configurable HTML formatter for DataFusion DataFrames. - - This class handles the HTML rendering of DataFrames for display in - Jupyter notebooks and other rich display contexts. 
- - This class supports extension through composition. Key extension points: - - Provide a custom StyleProvider for styling cells and headers - - Register custom formatters for specific types - - Provide custom cell builders for specialized cell rendering - - Args: - max_cell_length: Maximum characters to display in a cell before truncation - max_width: Maximum width of the HTML table in pixels - max_height: Maximum height of the HTML table in pixels - max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB) - min_rows_display: Minimum number of rows to display - repr_rows: Default number of rows to display in repr output - enable_cell_expansion: Whether to add expand/collapse buttons for long cell - values - custom_css: Additional CSS to include in the HTML output - show_truncation_message: Whether to display a message when data is truncated - style_provider: Custom provider for cell and header styles - use_shared_styles: Whether to load styles and scripts only once per notebook - session - """ - - # Class variable to track if styles have been loaded in the notebook - _styles_loaded = False - - def __init__( - self, - max_cell_length: int = 25, - max_width: int = 1000, - max_height: int = 300, - max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB - min_rows_display: int = 20, - repr_rows: int = 10, - enable_cell_expansion: bool = True, - custom_css: Optional[str] = None, - show_truncation_message: bool = True, - style_provider: Optional[StyleProvider] = None, - use_shared_styles: bool = True, - ) -> None: - """Initialize the HTML formatter. - - Parameters - ---------- - max_cell_length : int, default 25 - Maximum length of cell content before truncation. - max_width : int, default 1000 - Maximum width of the displayed table in pixels. - max_height : int, default 300 - Maximum height of the displayed table in pixels. - max_memory_bytes : int, default 2097152 (2MB) - Maximum memory in bytes for rendered data. 
- min_rows_display : int, default 20 - Minimum number of rows to display. - repr_rows : int, default 10 - Default number of rows to display in repr output. - enable_cell_expansion : bool, default True - Whether to allow cells to expand when clicked. - custom_css : str, optional - Custom CSS to apply to the HTML table. - show_truncation_message : bool, default True - Whether to show a message indicating that content has been truncated. - style_provider : StyleProvider, optional - Provider of CSS styles for the HTML table. If None, DefaultStyleProvider - is used. - use_shared_styles : bool, default True - Whether to use shared styles across multiple tables. - - Raises: - ------ - ValueError - If max_cell_length, max_width, max_height, max_memory_bytes, - min_rows_display, or repr_rows is not a positive integer. - TypeError - If enable_cell_expansion, show_truncation_message, or use_shared_styles is - not a boolean, - or if custom_css is provided but is not a string, - or if style_provider is provided but does not implement the StyleProvider - protocol. 
- """ - # Validate numeric parameters - _validate_positive_int(max_cell_length, "max_cell_length") - _validate_positive_int(max_width, "max_width") - _validate_positive_int(max_height, "max_height") - _validate_positive_int(max_memory_bytes, "max_memory_bytes") - _validate_positive_int(min_rows_display, "min_rows_display") - _validate_positive_int(repr_rows, "repr_rows") - - # Validate boolean parameters - _validate_bool(enable_cell_expansion, "enable_cell_expansion") - _validate_bool(show_truncation_message, "show_truncation_message") - _validate_bool(use_shared_styles, "use_shared_styles") - - # Validate custom_css - if custom_css is not None and not isinstance(custom_css, str): - msg = "custom_css must be None or a string" - raise TypeError(msg) - - # Validate style_provider - if style_provider is not None and not isinstance(style_provider, StyleProvider): - msg = "style_provider must implement the StyleProvider protocol" - raise TypeError(msg) - - self.max_cell_length = max_cell_length - self.max_width = max_width - self.max_height = max_height - self.max_memory_bytes = max_memory_bytes - self.min_rows_display = min_rows_display - self.repr_rows = repr_rows - self.enable_cell_expansion = enable_cell_expansion - self.custom_css = custom_css - self.show_truncation_message = show_truncation_message - self.style_provider = style_provider or DefaultStyleProvider() - self.use_shared_styles = use_shared_styles - # Registry for custom type formatters - self._type_formatters: dict[type, CellFormatter] = {} - # Custom cell builders - self._custom_cell_builder: Optional[Callable[[Any, int, int, str], str]] = None - self._custom_header_builder: Optional[Callable[[Any], str]] = None - - def register_formatter(self, type_class: type, formatter: CellFormatter) -> None: - """Register a custom formatter for a specific data type. 
- - Args: - type_class: The type to register a formatter for - formatter: Function that takes a value of the given type and returns - a formatted string - """ - self._type_formatters[type_class] = formatter - - def set_custom_cell_builder( - self, builder: Callable[[Any, int, int, str], str] - ) -> None: - """Set a custom cell builder function. - - Args: - builder: Function that takes (value, row, col, table_id) and returns HTML - """ - self._custom_cell_builder = builder - - def set_custom_header_builder(self, builder: Callable[[Any], str]) -> None: - """Set a custom header builder function. - - Args: - builder: Function that takes a field and returns HTML - """ - self._custom_header_builder = builder - - @classmethod - def is_styles_loaded(cls) -> bool: - """Check if HTML styles have been loaded in the current session. - - This method is primarily intended for debugging UI rendering issues - related to style loading. - - Returns: - True if styles have been loaded, False otherwise - - Example: - >>> from datafusion.html_formatter import DataFrameHtmlFormatter - >>> DataFrameHtmlFormatter.is_styles_loaded() - False - """ - return cls._styles_loaded - - def format_html( - self, - batches: list, - schema: Any, - has_more: bool = False, - table_uuid: str | None = None, - ) -> str: - """Format record batches as HTML. - - This method is used by DataFrame's _repr_html_ implementation and can be - called directly when custom HTML rendering is needed. 
- - Args: - batches: List of Arrow RecordBatch objects - schema: Arrow Schema object - has_more: Whether there are more batches not shown - table_uuid: Unique ID for the table, used for JavaScript interactions - - Returns: - HTML string representation of the data - - Raises: - TypeError: If schema is invalid and no batches are provided - """ - if not batches: - return "No data to display" - - # Validate schema - if schema is None or not hasattr(schema, "__iter__"): - msg = "Schema must be provided" - raise TypeError(msg) - - # Generate a unique ID if none provided - table_uuid = table_uuid or f"df-{id(batches)}" - - # Build HTML components - html = [] - - # Only include styles and scripts if: - # 1. Not using shared styles, OR - # 2. Using shared styles but they haven't been loaded yet - include_styles = ( - not self.use_shared_styles or not DataFrameHtmlFormatter._styles_loaded - ) - - if include_styles: - html.extend(self._build_html_header()) - # If we're using shared styles, mark them as loaded - if self.use_shared_styles: - DataFrameHtmlFormatter._styles_loaded = True - - html.extend(self._build_table_container_start()) - - # Add table header and body - html.extend(self._build_table_header(schema)) - html.extend(self._build_table_body(batches, table_uuid)) - - html.append("
" + f"{field.name}
" + f"
" + "" + "" + f"{formatted_value}" + f"" + f"
" + f"
{formatted_value}
") - html.append("
") - - # Add footer (JavaScript and messages) - if include_styles and self.enable_cell_expansion: - html.append(self._get_javascript()) - - # Always add truncation message if needed (independent of styles) - if has_more and self.show_truncation_message: - html.append("
Data truncated due to size.
") - - return "\n".join(html) - - def _build_html_header(self) -> list[str]: - """Build the HTML header with CSS styles.""" - html = [] - html.append("") - return html +import warnings - def _build_table_container_start(self) -> list[str]: - """Build the opening tags for the table container.""" - html = [] - html.append( - f'
' - ) - html.append('') - return html +from datafusion.dataframe_formatter import * # noqa: F403 - def _build_table_header(self, schema: Any) -> list[str]: - """Build the HTML table header with column names.""" - html = [] - html.append("") - html.append("") - for field in schema: - if self._custom_header_builder: - html.append(self._custom_header_builder(field)) - else: - html.append( - f"" - ) - html.append("") - html.append("") - return html - - def _build_table_body(self, batches: list, table_uuid: str) -> list[str]: - """Build the HTML table body with data rows.""" - html = [] - html.append("") - - row_count = 0 - for batch in batches: - for row_idx in range(batch.num_rows): - row_count += 1 - html.append("") - - for col_idx, column in enumerate(batch.columns): - # Get the raw value from the column - raw_value = self._get_cell_value(column, row_idx) - - # Always check for type formatters first to format the value - formatted_value = self._format_cell_value(raw_value) - - # Then apply either custom cell builder or standard cell formatting - if self._custom_cell_builder: - # Pass both the raw value and formatted value to let the - # builder decide - cell_html = self._custom_cell_builder( - raw_value, row_count, col_idx, table_uuid - ) - html.append(cell_html) - else: - # Standard cell formatting with formatted value - if ( - len(str(raw_value)) > self.max_cell_length - and self.enable_cell_expansion - ): - cell_html = self._build_expandable_cell( - formatted_value, row_count, col_idx, table_uuid - ) - else: - cell_html = self._build_regular_cell(formatted_value) - html.append(cell_html) - - html.append("") - - html.append("") - return html - - def _get_cell_value(self, column: Any, row_idx: int) -> Any: - """Extract a cell value from a column. 
- - Args: - column: Arrow array - row_idx: Row index - - Returns: - The raw cell value - """ - try: - value = column[row_idx] - - if hasattr(value, "as_py"): - return value.as_py() - except (AttributeError, TypeError): - pass - else: - return value - - def _format_cell_value(self, value: Any) -> str: - """Format a cell value for display. - - Uses registered type formatters if available. - - Args: - value: The cell value to format - - Returns: - Formatted cell value as string - """ - # Check for custom type formatters - for type_cls, formatter in self._type_formatters.items(): - if isinstance(value, type_cls): - return formatter(value) - - # If no formatter matched, return string representation - return str(value) - - def _build_expandable_cell( - self, formatted_value: str, row_count: int, col_idx: int, table_uuid: str - ) -> str: - """Build an expandable cell for long content.""" - short_value = str(formatted_value)[: self.max_cell_length] - return ( - f"" - ) - - def _build_regular_cell(self, formatted_value: str) -> str: - """Build a regular table cell.""" - return ( - f"" - ) - - def _build_html_footer(self, has_more: bool) -> list[str]: - """Build the HTML footer with JavaScript and messages.""" - html = [] - - # Add JavaScript for interactivity only if cell expansion is enabled - # and we're not using the shared styles approach - if self.enable_cell_expansion and not self.use_shared_styles: - html.append(self._get_javascript()) - - # Add truncation message if needed - if has_more and self.show_truncation_message: - html.append("
Data truncated due to size.
") - - return html - - def _get_default_css(self) -> str: - """Get default CSS styles for the HTML table.""" - return """ - .expandable-container { - display: inline-block; - max-width: 200px; - } - .expandable { - white-space: nowrap; - overflow: hidden; - text-overflow: ellipsis; - display: block; - } - .full-text { - display: none; - white-space: normal; - } - .expand-btn { - cursor: pointer; - color: blue; - text-decoration: underline; - border: none; - background: none; - font-size: inherit; - display: block; - margin-top: 5px; - } - """ - - def _get_javascript(self) -> str: - """Get JavaScript code for interactive elements.""" - return """ - - """ - - -class FormatterManager: - """Manager class for the global DataFrame HTML formatter instance.""" - - _default_formatter: DataFrameHtmlFormatter = DataFrameHtmlFormatter() - - @classmethod - def set_formatter(cls, formatter: DataFrameHtmlFormatter) -> None: - """Set the global DataFrame HTML formatter. - - Args: - formatter: The formatter instance to use globally - """ - cls._default_formatter = formatter - _refresh_formatter_reference() - - @classmethod - def get_formatter(cls) -> DataFrameHtmlFormatter: - """Get the current global DataFrame HTML formatter. - - Returns: - The global HTML formatter instance - """ - return cls._default_formatter - - -def get_formatter() -> DataFrameHtmlFormatter: - """Get the current global DataFrame HTML formatter. - - This function is used by the DataFrame._repr_html_ implementation to access - the shared formatter instance. It can also be used directly when custom - HTML rendering is needed. - - Returns: - The global HTML formatter instance - - Example: - >>> from datafusion.html_formatter import get_formatter - >>> formatter = get_formatter() - >>> formatter.max_cell_length = 50 # Increase cell length - """ - return FormatterManager.get_formatter() - - -def set_formatter(formatter: DataFrameHtmlFormatter) -> None: - """Set the global DataFrame HTML formatter. 
- - Args: - formatter: The formatter instance to use globally - - Example: - >>> from datafusion.html_formatter import get_formatter, set_formatter - >>> custom_formatter = DataFrameHtmlFormatter(max_cell_length=100) - >>> set_formatter(custom_formatter) - """ - FormatterManager.set_formatter(formatter) - - -def configure_formatter(**kwargs: Any) -> None: - """Configure the global DataFrame HTML formatter. - - This function creates a new formatter with the provided configuration - and sets it as the global formatter for all DataFrames. - - Args: - **kwargs: Formatter configuration parameters like max_cell_length, - max_width, max_height, enable_cell_expansion, etc. - - Raises: - ValueError: If any invalid parameters are provided - - Example: - >>> from datafusion.html_formatter import configure_formatter - >>> configure_formatter( - ... max_cell_length=50, - ... max_height=500, - ... enable_cell_expansion=True, - ... use_shared_styles=True - ... ) - """ - # Valid parameters accepted by DataFrameHtmlFormatter - valid_params = { - "max_cell_length", - "max_width", - "max_height", - "max_memory_bytes", - "min_rows_display", - "repr_rows", - "enable_cell_expansion", - "custom_css", - "show_truncation_message", - "style_provider", - "use_shared_styles", - } - - # Check for invalid parameters - invalid_params = set(kwargs) - valid_params - if invalid_params: - msg = ( - f"Invalid formatter parameters: {', '.join(invalid_params)}. " - f"Valid parameters are: {', '.join(valid_params)}" - ) - raise ValueError(msg) - - # Create and set formatter with validated parameters - set_formatter(DataFrameHtmlFormatter(**kwargs)) - - -def reset_formatter() -> None: - """Reset the global DataFrame HTML formatter to default settings. - - This function creates a new formatter with default configuration - and sets it as the global formatter for all DataFrames. 
- - Example: - >>> from datafusion.html_formatter import reset_formatter - >>> reset_formatter() # Reset formatter to default settings - """ - formatter = DataFrameHtmlFormatter() - # Reset the styles_loaded flag to ensure styles will be reloaded - DataFrameHtmlFormatter._styles_loaded = False - set_formatter(formatter) - - -def reset_styles_loaded_state() -> None: - """Reset the styles loaded state to force reloading of styles. - - This can be useful when switching between notebook sessions or - when styles need to be refreshed. - - Example: - >>> from datafusion.html_formatter import reset_styles_loaded_state - >>> reset_styles_loaded_state() # Force styles to reload in next render - """ - DataFrameHtmlFormatter._styles_loaded = False - - -def _refresh_formatter_reference() -> None: - """Refresh formatter reference in any modules using it. - - This helps ensure that changes to the formatter are reflected in existing - DataFrames that might be caching the formatter reference. - """ - # This is a no-op but signals modules to refresh their reference +warnings.warn( + "The module 'html_formatter' is deprecated and will be removed in the next release." 
+ "Please use 'dataframe_formatter' instead.", + DeprecationWarning, + stacklevel=3, +) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index deaa30b3d..c9ae38d8e 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -37,14 +37,14 @@ from datafusion import ( functions as f, ) -from datafusion.expr import Window -from datafusion.html_formatter import ( +from datafusion.dataframe_formatter import ( DataFrameHtmlFormatter, configure_formatter, get_formatter, reset_formatter, reset_styles_loaded_state, ) +from datafusion.expr import Window from pyarrow.csv import write_csv MB = 1024 * 1024 diff --git a/src/dataframe.rs b/src/dataframe.rs index 3d68db279..c2ad4771e 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -24,6 +24,7 @@ use arrow::compute::can_cast_types; use arrow::error::ArrowError; use arrow::ffi::FFI_ArrowSchema; use arrow::ffi_stream::FFI_ArrowArrayStream; +use arrow::pyarrow::FromPyArrow; use datafusion::arrow::datatypes::Schema; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; use datafusion::arrow::util::pretty; @@ -150,9 +151,9 @@ fn get_python_formatter_with_config(py: Python) -> PyResult { Ok(PythonFormatter { formatter, config }) } -/// Get the Python formatter from the datafusion.html_formatter module +/// Get the Python formatter from the datafusion.dataframe_formatter module fn import_python_formatter(py: Python) -> PyResult> { - let formatter_module = py.import("datafusion.html_formatter")?; + let formatter_module = py.import("datafusion.dataframe_formatter")?; let get_formatter = formatter_module.getattr("get_formatter")?; get_formatter.call0() } @@ -295,6 +296,46 @@ impl PyDataFrame { pub fn new(df: DataFrame) -> Self { Self { df: Arc::new(df) } } + + fn prepare_repr_string(&self, py: Python, as_html: bool) -> PyDataFusionResult { + // Get the Python formatter and config + let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?; + let (batches, 
has_more) = wait_for_future( + py, + collect_record_batches_to_display(self.df.as_ref().clone(), config), + )??; + if batches.is_empty() { + // This should not be reached, but do it for safety since we index into the vector below + return Ok("No data to display".to_string()); + } + + let table_uuid = uuid::Uuid::new_v4().to_string(); + + // Convert record batches to PyObject list + let py_batches = batches + .into_iter() + .map(|rb| rb.to_pyarrow(py)) + .collect::>>()?; + + let py_schema = self.schema().into_pyobject(py)?; + + let kwargs = pyo3::types::PyDict::new(py); + let py_batches_list = PyList::new(py, py_batches.as_slice())?; + kwargs.set_item("batches", py_batches_list)?; + kwargs.set_item("schema", py_schema)?; + kwargs.set_item("has_more", has_more)?; + kwargs.set_item("table_uuid", table_uuid)?; + + let method_name = match as_html { + true => "format_html", + false => "format_str", + }; + + let html_result = formatter.call_method(method_name, (), Some(&kwargs))?; + let html_str: String = html_result.extract()?; + + Ok(html_str) + } } #[pymethods] @@ -321,18 +362,27 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult { - // Get the Python formatter config - let PythonFormatter { - formatter: _, - config, - } = get_python_formatter_with_config(py)?; - let (batches, has_more) = wait_for_future( - py, - collect_record_batches_to_display(self.df.as_ref().clone(), config), - )??; + self.prepare_repr_string(py, false) + } + + #[staticmethod] + #[expect(unused_variables)] + fn default_str_repr<'py>( + batches: Vec>, + schema: &Bound<'py, PyAny>, + has_more: bool, + table_uuid: &str, + ) -> PyResult { + let batches = batches + .into_iter() + .map(|batch| RecordBatch::from_pyarrow_bound(&batch)) + .collect::>>()? 
+ .into_iter() + .filter(|batch| batch.num_rows() > 0) + .collect::>(); + if batches.is_empty() { - // This should not be reached, but do it for safety since we index into the vector below - return Ok("No data to display".to_string()); + return Ok("No data to display".to_owned()); } let batches_as_displ = @@ -347,38 +397,7 @@ impl PyDataFrame { } fn _repr_html_(&self, py: Python) -> PyDataFusionResult { - // Get the Python formatter and config - let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?; - let (batches, has_more) = wait_for_future( - py, - collect_record_batches_to_display(self.df.as_ref().clone(), config), - )??; - if batches.is_empty() { - // This should not be reached, but do it for safety since we index into the vector below - return Ok("No data to display".to_string()); - } - - let table_uuid = uuid::Uuid::new_v4().to_string(); - - // Convert record batches to PyObject list - let py_batches = batches - .into_iter() - .map(|rb| rb.to_pyarrow(py)) - .collect::>>()?; - - let py_schema = self.schema().into_pyobject(py)?; - - let kwargs = pyo3::types::PyDict::new(py); - let py_batches_list = PyList::new(py, py_batches.as_slice())?; - kwargs.set_item("batches", py_batches_list)?; - kwargs.set_item("schema", py_schema)?; - kwargs.set_item("has_more", has_more)?; - kwargs.set_item("table_uuid", table_uuid)?; - - let html_result = formatter.call_method("format_html", (), Some(&kwargs))?; - let html_str: String = html_result.extract()?; - - Ok(html_str) + self.prepare_repr_string(py, true) } /// Calculate summary statistics for a DataFrame From 954563429384078a9e85c56ad553c7e3be7ac52a Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 25 Jun 2025 11:29:35 -0400 Subject: [PATCH 147/248] feat: collect once during display() in jupyter notebooks (#1167) * Only collect one time during display() in jupyter notebooks * Check for juypter notebook environment specifically * Remove approach of checking environment which could not 
differentiate between jupyter console and notebook * Instead of trying to detect notebook vs console, collect one time when we have any kind if ipython environment. --- src/dataframe.rs | 36 ++++++++++++++++++++++++++---------- src/utils.rs | 11 +++++++++++ 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index c2ad4771e..ab4749e35 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -51,7 +51,7 @@ use crate::physical_plan::PyExecutionPlan; use crate::record_batch::PyRecordBatchStream; use crate::sql::logical::PyLogicalPlan; use crate::utils::{ - get_tokio_runtime, py_obj_to_scalar_value, validate_pycapsule, wait_for_future, + get_tokio_runtime, is_ipython_env, py_obj_to_scalar_value, validate_pycapsule, wait_for_future, }; use crate::{ errors::PyDataFusionResult, @@ -289,21 +289,33 @@ impl PyParquetColumnOptions { #[derive(Clone)] pub struct PyDataFrame { df: Arc, + + // In IPython environment cache batches between __repr__ and _repr_html_ calls. 
+ batches: Option<(Vec, bool)>, } impl PyDataFrame { /// creates a new PyDataFrame pub fn new(df: DataFrame) -> Self { - Self { df: Arc::new(df) } + Self { + df: Arc::new(df), + batches: None, + } } - fn prepare_repr_string(&self, py: Python, as_html: bool) -> PyDataFusionResult { + fn prepare_repr_string(&mut self, py: Python, as_html: bool) -> PyDataFusionResult { // Get the Python formatter and config let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?; - let (batches, has_more) = wait_for_future( - py, - collect_record_batches_to_display(self.df.as_ref().clone(), config), - )??; + + let should_cache = *is_ipython_env(py) && self.batches.is_none(); + let (batches, has_more) = match self.batches.take() { + Some(b) => b, + None => wait_for_future( + py, + collect_record_batches_to_display(self.df.as_ref().clone(), config), + )??, + }; + if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below return Ok("No data to display".to_string()); @@ -313,7 +325,7 @@ impl PyDataFrame { // Convert record batches to PyObject list let py_batches = batches - .into_iter() + .iter() .map(|rb| rb.to_pyarrow(py)) .collect::>>()?; @@ -334,6 +346,10 @@ impl PyDataFrame { let html_result = formatter.call_method(method_name, (), Some(&kwargs))?; let html_str: String = html_result.extract()?; + if should_cache { + self.batches = Some((batches, has_more)); + } + Ok(html_str) } } @@ -361,7 +377,7 @@ impl PyDataFrame { } } - fn __repr__(&self, py: Python) -> PyDataFusionResult { + fn __repr__(&mut self, py: Python) -> PyDataFusionResult { self.prepare_repr_string(py, false) } @@ -396,7 +412,7 @@ impl PyDataFrame { Ok(format!("DataFrame()\n{batches_as_displ}{additional_str}")) } - fn _repr_html_(&self, py: Python) -> PyDataFusionResult { + fn _repr_html_(&mut self, py: Python) -> PyDataFusionResult { self.prepare_repr_string(py, true) } diff --git a/src/utils.rs b/src/utils.rs index 90d654385..f4e121fd5 
100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -39,6 +39,17 @@ pub(crate) fn get_tokio_runtime() -> &'static TokioRuntime { RUNTIME.get_or_init(|| TokioRuntime(tokio::runtime::Runtime::new().unwrap())) } +#[inline] +pub(crate) fn is_ipython_env(py: Python) -> &'static bool { + static IS_IPYTHON_ENV: OnceLock = OnceLock::new(); + IS_IPYTHON_ENV.get_or_init(|| { + py.import("IPython") + .and_then(|ipython| ipython.call_method0("get_ipython")) + .map(|ipython| !ipython.is_none()) + .unwrap_or(false) + }) +} + /// Utility to get the Global Datafussion CTX #[inline] pub(crate) fn get_global_ctx() -> &'static SessionContext { From 9362f53150e5423581757ed56883b3ca2c95b8a2 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 2 Jul 2025 08:08:53 -0400 Subject: [PATCH 148/248] feat: python based catalog and schema provider (#1156) * Exposing FFI to python * Exposing FFI to python * Workin progress on python catalog * Flushing out schema and catalog providers * Adding implementation of python based catalog and schema providers * Small updates after rebase * Add default in memory options for adding schema and catalogs * Add support for creating in memory catalog and schema * Update from database to schema in unit tests * xfailed label no longer applies to these unit tests * Defining abstract methods for catalog and schema providers * Working through issues between custom catalog and build in schema * Check types on schema provider to return * Add docstring * Add documentation about how to use catalog and schema providers * Re-add module to all after rebase * Minor bugfix * Clippy updates from the new rust version --------- Co-authored-by: renato2099 --- Cargo.lock | 19 + Cargo.toml | 2 + docs/source/user-guide/data-sources.rst | 56 ++ examples/datafusion-ffi-example/Cargo.lock | 1 + examples/datafusion-ffi-example/Cargo.toml | 1 + .../python/tests/_test_catalog_provider.py | 60 +++ .../src/catalog_provider.rs | 179 +++++++ examples/datafusion-ffi-example/src/lib.rs | 3 + 
python/datafusion/__init__.py | 1 + python/datafusion/catalog.py | 195 ++++++- python/datafusion/context.py | 24 +- python/datafusion/dataframe.py | 9 +- python/tests/test_catalog.py | 173 ++++++- python/tests/test_context.py | 40 +- python/tests/test_sql.py | 30 +- python/tests/test_substrait.py | 4 +- src/catalog.rs | 490 ++++++++++++++++-- src/common/data_type.rs | 120 ++--- src/context.rs | 61 ++- src/expr.rs | 15 +- src/expr/aggregate.rs | 2 +- src/expr/aggregate_expr.rs | 2 +- src/expr/alias.rs | 2 +- src/expr/analyze.rs | 2 +- src/expr/between.rs | 2 +- src/expr/column.rs | 2 +- src/expr/copy_to.rs | 4 +- src/expr/create_catalog.rs | 2 +- src/expr/create_catalog_schema.rs | 2 +- src/expr/create_external_table.rs | 2 +- src/expr/create_function.rs | 2 +- src/expr/create_index.rs | 2 +- src/expr/create_memory_table.rs | 2 +- src/expr/create_view.rs | 2 +- src/expr/describe_table.rs | 2 +- src/expr/distinct.rs | 5 +- src/expr/drop_catalog_schema.rs | 2 +- src/expr/drop_function.rs | 2 +- src/expr/drop_table.rs | 2 +- src/expr/drop_view.rs | 2 +- src/expr/empty_relation.rs | 2 +- src/expr/filter.rs | 2 +- src/expr/join.rs | 2 +- src/expr/like.rs | 6 +- src/expr/limit.rs | 2 +- src/expr/projection.rs | 2 +- src/expr/recursive_query.rs | 2 +- src/expr/repartition.rs | 2 +- src/expr/sort.rs | 2 +- src/expr/sort_expr.rs | 2 +- src/expr/subquery.rs | 2 +- src/expr/subquery_alias.rs | 2 +- src/expr/table_scan.rs | 2 +- src/expr/union.rs | 2 +- src/expr/unnest.rs | 2 +- src/expr/unnest_expr.rs | 2 +- src/expr/window.rs | 11 +- src/functions.rs | 2 +- src/lib.rs | 10 +- src/physical_plan.rs | 3 +- src/sql/logical.rs | 3 +- src/utils.rs | 5 +- 62 files changed, 1340 insertions(+), 258 deletions(-) create mode 100644 examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py create mode 100644 examples/datafusion-ffi-example/src/catalog_provider.rs diff --git a/Cargo.lock b/Cargo.lock index 112167cb4..a3e9336cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ 
-165,6 +165,12 @@ dependencies = [ "zstd", ] +[[package]] +name = "arc-swap" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" + [[package]] name = "arrayref" version = "0.3.9" @@ -1503,6 +1509,7 @@ dependencies = [ "datafusion-proto", "datafusion-substrait", "futures", + "log", "mimalloc", "object_store", "prost", @@ -1510,6 +1517,7 @@ dependencies = [ "pyo3", "pyo3-async-runtimes", "pyo3-build-config", + "pyo3-log", "tokio", "url", "uuid", @@ -2953,6 +2961,17 @@ dependencies = [ "pyo3-build-config", ] +[[package]] +name = "pyo3-log" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45192e5e4a4d2505587e27806c7b710c231c40c56f3bfc19535d0bb25df52264" +dependencies = [ + "arc-swap", + "log", + "pyo3", +] + [[package]] name = "pyo3-macros" version = "0.24.2" diff --git a/Cargo.toml b/Cargo.toml index 4135e64e2..1f7895a50 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,7 @@ substrait = ["dep:datafusion-substrait"] tokio = { version = "1.45", features = ["macros", "rt", "rt-multi-thread", "sync"] } pyo3 = { version = "0.24", features = ["extension-module", "abi3", "abi3-py39"] } pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"]} +pyo3-log = "0.12.4" arrow = { version = "55.1.0", features = ["pyarrow"] } datafusion = { version = "48.0.0", features = ["avro", "unicode_expressions"] } datafusion-substrait = { version = "48.0.0", optional = true } @@ -49,6 +50,7 @@ async-trait = "0.1.88" futures = "0.3" object_store = { version = "0.12.1", features = ["aws", "gcp", "azure", "http"] } url = "2" +log = "0.4.27" [build-dependencies] prost-types = "0.13.1" # keep in line with `datafusion-substrait` diff --git a/docs/source/user-guide/data-sources.rst b/docs/source/user-guide/data-sources.rst index ba5967c97..9c95d58e0 100644 --- a/docs/source/user-guide/data-sources.rst +++ 
b/docs/source/user-guide/data-sources.rst @@ -185,3 +185,59 @@ the interface as describe in the :ref:`Custom Table Provider `_ is provided in the DataFusion repository. + +Catalog +======= + +A common technique for organizing tables is using a three level hierarchical approach. DataFusion +supports this form of organizing using the :py:class:`~datafusion.catalog.Catalog`, +:py:class:`~datafusion.catalog.Schema`, and :py:class:`~datafusion.catalog.Table`. By default, +a :py:class:`~datafusion.context.SessionContext` comes with a single Catalog and a single Schema +with the names ``datafusion`` and ``default``, respectively. + +The default implementation uses an in-memory approach to the catalog and schema. We have support +for adding additional in-memory catalogs and schemas. This can be done like in the following +example: + +.. code-block:: python + + from datafusion.catalog import Catalog, Schema + + my_catalog = Catalog.memory_catalog() + my_schema = Schema.memory_schema() + + my_catalog.register_schema("my_schema_name", my_schema) + + ctx.register_catalog("my_catalog_name", my_catalog) + +You could then register tables in ``my_schema`` and access them either through the DataFrame +API or via sql commands such as ``"SELECT * from my_catalog_name.my_schema_name.my_table"``. + +User Defined Catalog and Schema +------------------------------- + +If the in-memory catalogs are insufficient for your uses, there are two approaches you can take +to implementing a custom catalog and/or schema. In the below discussion, we describe how to +implement these for a Catalog, but the approach to implementing for a Schema is nearly +identical. + +DataFusion supports Catalogs written in either Rust or Python. If you write a Catalog in Rust, +you will need to export it as a Python library via PyO3. There is a complete example of a +catalog implemented this way in the +`examples folder `_ +of our repository. 
Writing catalog providers in Rust can typically lead to significant +performance improvements over the Python based approach. + +To implement a Catalog in Python, you will need to inherit from the abstract base class +:py:class:`~datafusion.catalog.CatalogProvider`. There are examples in the +`unit tests `_ of +implementing a basic Catalog in Python where we simply keep a dictionary of the +registered Schemas. + +One important note for developers is that when we have a Catalog defined in Python, we have +two different ways of accessing this Catalog. First, we register the catalog with a Rust +wrapper. This allows for any Rust-based code to call the Python functions as necessary. +Second, if the user accesses the Catalog via the Python API, we identify this and return back +the original Python object that implements the Catalog. This is an important distinction +for developers because we do *not* return a Python wrapper around the Rust wrapper of the +original Python object. diff --git a/examples/datafusion-ffi-example/Cargo.lock b/examples/datafusion-ffi-example/Cargo.lock index 075ebd5a1..e5a1ca8d1 100644 --- a/examples/datafusion-ffi-example/Cargo.lock +++ b/examples/datafusion-ffi-example/Cargo.lock @@ -1448,6 +1448,7 @@ dependencies = [ "arrow", "arrow-array", "arrow-schema", + "async-trait", "datafusion", "datafusion-ffi", "pyo3", diff --git a/examples/datafusion-ffi-example/Cargo.toml b/examples/datafusion-ffi-example/Cargo.toml index 0e17567b9..319163554 100644 --- a/examples/datafusion-ffi-example/Cargo.toml +++ b/examples/datafusion-ffi-example/Cargo.toml @@ -27,6 +27,7 @@ pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] arrow = { version = "55.0.0" } arrow-array = { version = "55.0.0" } arrow-schema = { version = "55.0.0" } +async-trait = "0.1.88" [build-dependencies] pyo3-build-config = "0.23" diff --git a/examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py
b/examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py new file mode 100644 index 000000000..72aadf64c --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from __future__ import annotations + +import pyarrow as pa +from datafusion import SessionContext +from datafusion_ffi_example import MyCatalogProvider + + +def test_catalog_provider(): + ctx = SessionContext() + + my_catalog_name = "my_catalog" + expected_schema_name = "my_schema" + expected_table_name = "my_table" + expected_table_columns = ["units", "price"] + + catalog_provider = MyCatalogProvider() + ctx.register_catalog_provider(my_catalog_name, catalog_provider) + my_catalog = ctx.catalog(my_catalog_name) + + my_catalog_schemas = my_catalog.names() + assert expected_schema_name in my_catalog_schemas + my_database = my_catalog.database(expected_schema_name) + assert expected_table_name in my_database.names() + my_table = my_database.table(expected_table_name) + assert expected_table_columns == my_table.schema.names + + result = ctx.table( + f"{my_catalog_name}.{expected_schema_name}.{expected_table_name}" + ).collect() + assert len(result) == 2 + + col0_result = [r.column(0) for r in result] + col1_result = [r.column(1) for r in result] + expected_col0 = [ + pa.array([10, 20, 30], type=pa.int32()), + pa.array([5, 7], type=pa.int32()), + ] + expected_col1 = [ + pa.array([1, 2, 5], type=pa.float64()), + pa.array([1.5, 2.5], type=pa.float64()), + ] + assert col0_result == expected_col0 + assert col1_result == expected_col1 diff --git a/examples/datafusion-ffi-example/src/catalog_provider.rs b/examples/datafusion-ffi-example/src/catalog_provider.rs new file mode 100644 index 000000000..54e61cf3e --- /dev/null +++ b/examples/datafusion-ffi-example/src/catalog_provider.rs @@ -0,0 +1,179 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use pyo3::{pyclass, pymethods, Bound, PyResult, Python}; +use std::{any::Any, fmt::Debug, sync::Arc}; + +use arrow::datatypes::Schema; +use async_trait::async_trait; +use datafusion::{ + catalog::{ + CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, SchemaProvider, TableProvider, + }, + common::exec_err, + datasource::MemTable, + error::{DataFusionError, Result}, +}; +use datafusion_ffi::catalog_provider::FFI_CatalogProvider; +use pyo3::types::PyCapsule; + +pub fn my_table() -> Arc { + use arrow::datatypes::{DataType, Field}; + use datafusion::common::record_batch; + + let schema = Arc::new(Schema::new(vec![ + Field::new("units", DataType::Int32, true), + Field::new("price", DataType::Float64, true), + ])); + + let partitions = vec![ + record_batch!( + ("units", Int32, vec![10, 20, 30]), + ("price", Float64, vec![1.0, 2.0, 5.0]) + ) + .unwrap(), + record_batch!( + ("units", Int32, vec![5, 7]), + ("price", Float64, vec![1.5, 2.5]) + ) + .unwrap(), + ]; + + Arc::new(MemTable::try_new(schema, vec![partitions]).unwrap()) +} + +#[derive(Debug)] +pub struct FixedSchemaProvider { + inner: MemorySchemaProvider, +} + +impl Default for FixedSchemaProvider { + fn default() -> Self { + let inner = MemorySchemaProvider::new(); + + let table = my_table(); + + let _ = inner.register_table("my_table".to_string(), table).unwrap(); + + Self { inner } + } +} + +#[async_trait] +impl 
SchemaProvider for FixedSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + self.inner.table_names() + } + + async fn table(&self, name: &str) -> Result>, DataFusionError> { + self.inner.table(name).await + } + + fn register_table( + &self, + name: String, + table: Arc, + ) -> Result>> { + self.inner.register_table(name, table) + } + + fn deregister_table(&self, name: &str) -> Result>> { + self.inner.deregister_table(name) + } + + fn table_exist(&self, name: &str) -> bool { + self.inner.table_exist(name) + } +} + +/// This catalog provider is intended only for unit tests. It prepopulates with one +/// schema and only allows for schemas named after four types of fruit. +#[pyclass( + name = "MyCatalogProvider", + module = "datafusion_ffi_example", + subclass +)] +#[derive(Debug)] +pub(crate) struct MyCatalogProvider { + inner: MemoryCatalogProvider, +} + +impl Default for MyCatalogProvider { + fn default() -> Self { + let inner = MemoryCatalogProvider::new(); + + let schema_name: &str = "my_schema"; + let _ = inner.register_schema(schema_name, Arc::new(FixedSchemaProvider::default())); + + Self { inner } + } +} + +impl CatalogProvider for MyCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + self.inner.schema_names() + } + + fn schema(&self, name: &str) -> Option> { + self.inner.schema(name) + } + + fn register_schema( + &self, + name: &str, + schema: Arc, + ) -> Result>> { + self.inner.register_schema(name, schema) + } + + fn deregister_schema( + &self, + name: &str, + cascade: bool, + ) -> Result>> { + self.inner.deregister_schema(name, cascade) + } +} + +#[pymethods] +impl MyCatalogProvider { + #[new] + pub fn new() -> Self { + Self { + inner: Default::default(), + } + } + + pub fn __datafusion_catalog_provider__<'py>( + &self, + py: Python<'py>, + ) -> PyResult> { + let name = cr"datafusion_catalog_provider".into(); + let catalog_provider = + 
FFI_CatalogProvider::new(Arc::new(MyCatalogProvider::default()), None); + + PyCapsule::new(py, catalog_provider, Some(name)) + } +} diff --git a/examples/datafusion-ffi-example/src/lib.rs b/examples/datafusion-ffi-example/src/lib.rs index ae08c3b65..3a4cf2247 100644 --- a/examples/datafusion-ffi-example/src/lib.rs +++ b/examples/datafusion-ffi-example/src/lib.rs @@ -15,10 +15,12 @@ // specific language governing permissions and limitations // under the License. +use crate::catalog_provider::MyCatalogProvider; use crate::table_function::MyTableFunction; use crate::table_provider::MyTableProvider; use pyo3::prelude::*; +pub(crate) mod catalog_provider; pub(crate) mod table_function; pub(crate) mod table_provider; @@ -26,5 +28,6 @@ pub(crate) mod table_provider; fn datafusion_ffi_example(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; + m.add_class::()?; Ok(()) } diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index fd7f4fc06..e9d2dba75 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -92,6 +92,7 @@ "TableFunction", "WindowFrame", "WindowUDF", + "catalog", "col", "column", "common", diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py index 67ab3ead2..536b3a790 100644 --- a/python/datafusion/catalog.py +++ b/python/datafusion/catalog.py @@ -19,18 +19,33 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Protocol import datafusion._internal as df_internal if TYPE_CHECKING: import pyarrow as pa +try: + from warnings import deprecated # Python 3.13+ +except ImportError: + from typing_extensions import deprecated # Python 3.12 + + +__all__ = [ + "Catalog", + "CatalogProvider", + "Schema", + "SchemaProvider", + "Table", +] + class Catalog: """DataFusion data catalog.""" - def __init__(self, catalog: df_internal.Catalog) -> None: + def __init__(self, catalog: 
df_internal.catalog.RawCatalog) -> None: """This constructor is not typically called by the end user.""" self.catalog = catalog @@ -38,39 +53,95 @@ def __repr__(self) -> str: """Print a string representation of the catalog.""" return self.catalog.__repr__() - def names(self) -> list[str]: - """Returns the list of databases in this catalog.""" - return self.catalog.names() + def names(self) -> set[str]: + """This is an alias for `schema_names`.""" + return self.schema_names() + + def schema_names(self) -> set[str]: + """Returns the list of schemas in this catalog.""" + return self.catalog.schema_names() + + @staticmethod + def memory_catalog() -> Catalog: + """Create an in-memory catalog provider.""" + catalog = df_internal.catalog.RawCatalog.memory_catalog() + return Catalog(catalog) - def database(self, name: str = "public") -> Database: + def schema(self, name: str = "public") -> Schema: """Returns the database with the given ``name`` from this catalog.""" - return Database(self.catalog.database(name)) + schema = self.catalog.schema(name) + + return ( + Schema(schema) + if isinstance(schema, df_internal.catalog.RawSchema) + else schema + ) + + @deprecated("Use `schema` instead.") + def database(self, name: str = "public") -> Schema: + """Returns the database with the given ``name`` from this catalog.""" + return self.schema(name) + + def register_schema(self, name, schema) -> Schema | None: + """Register a schema with this catalog.""" + if isinstance(schema, Schema): + return self.catalog.register_schema(name, schema._raw_schema) + return self.catalog.register_schema(name, schema) + + def deregister_schema(self, name: str, cascade: bool = True) -> Schema | None: + """Deregister a schema from this catalog.""" + return self.catalog.deregister_schema(name, cascade) -class Database: - """DataFusion Database.""" +class Schema: + """DataFusion Schema.""" - def __init__(self, db: df_internal.Database) -> None: + def __init__(self, schema: df_internal.catalog.RawSchema) 
-> None: """This constructor is not typically called by the end user.""" - self.db = db + self._raw_schema = schema def __repr__(self) -> str: - """Print a string representation of the database.""" - return self.db.__repr__() + """Print a string representation of the schema.""" + return self._raw_schema.__repr__() + + @staticmethod + def memory_schema() -> Schema: + """Create an in-memory schema provider.""" + schema = df_internal.catalog.RawSchema.memory_schema() + return Schema(schema) def names(self) -> set[str]: - """Returns the list of all tables in this database.""" - return self.db.names() + """This is an alias for `table_names`.""" + return self.table_names() + + def table_names(self) -> set[str]: + """Returns the list of all tables in this schema.""" + return self._raw_schema.table_names def table(self, name: str) -> Table: - """Return the table with the given ``name`` from this database.""" - return Table(self.db.table(name)) + """Return the table with the given ``name`` from this schema.""" + return Table(self._raw_schema.table(name)) + + def register_table(self, name, table) -> None: + """Register a table provider in this schema.""" + if isinstance(table, Table): + return self._raw_schema.register_table(name, table.table) + return self._raw_schema.register_table(name, table) + + def deregister_table(self, name: str) -> None: + """Deregister a table provider from this schema.""" + return self._raw_schema.deregister_table(name) + + +@deprecated("Use `Schema` instead.") +class Database(Schema): + """See `Schema`.""" class Table: """DataFusion table.""" - def __init__(self, table: df_internal.Table) -> None: + def __init__(self, table: df_internal.catalog.RawTable) -> None: """This constructor is not typically called by the end user.""" self.table = table @@ -78,6 +149,11 @@ def __repr__(self) -> str: """Print a string representation of the table.""" return self.table.__repr__() + @staticmethod + def from_dataset(dataset: pa.dataset.Dataset) -> Table: + 
"""Turn a pyarrow Dataset into a Table.""" + return Table(df_internal.catalog.RawTable.from_dataset(dataset)) + @property def schema(self) -> pa.Schema: """Returns the schema associated with this table.""" @@ -87,3 +163,86 @@ def schema(self) -> pa.Schema: def kind(self) -> str: """Returns the kind of table.""" return self.table.kind + + +class CatalogProvider(ABC): + """Abstract class for defining a Python based Catalog Provider.""" + + @abstractmethod + def schema_names(self) -> set[str]: + """Set of the names of all schemas in this catalog.""" + ... + + @abstractmethod + def schema(self, name: str) -> Schema | None: + """Retrieve a specific schema from this catalog.""" + ... + + def register_schema( # noqa: B027 + self, name: str, schema: SchemaProviderExportable | SchemaProvider | Schema + ) -> None: + """Add a schema to this catalog. + + This method is optional. If your catalog provides a fixed list of schemas, you + do not need to implement this method. + """ + + def deregister_schema(self, name: str, cascade: bool) -> None: # noqa: B027 + """Remove a schema from this catalog. + + This method is optional. If your catalog provides a fixed list of schemas, you + do not need to implement this method. + + Args: + name: The name of the schema to remove. + cascade: If true, deregister the tables within the schema. + """ + + +class SchemaProvider(ABC): + """Abstract class for defining a Python based Schema Provider.""" + + def owner_name(self) -> str | None: + """Returns the owner of the schema. + + This is an optional method. The default return is None. + """ + return None + + @abstractmethod + def table_names(self) -> set[str]: + """Set of the names of all tables in this schema.""" + ... + + @abstractmethod + def table(self, name: str) -> Table | None: + """Retrieve a specific table from this schema.""" + ... + + def register_table(self, name: str, table: Table) -> None: # noqa: B027 + """Add a table from this schema. + + This method is optional. 
If your schema provides a fixed list of tables, you do + not need to implement this method. + """ + + def deregister_table(self, name, cascade: bool) -> None: # noqa: B027 + """Remove a table from this schema. + + This method is optional. If your schema provides a fixed list of tables, you do + not need to implement this method. + """ + + @abstractmethod + def table_exist(self, name: str) -> bool: + """Returns true if the table exists in this schema.""" + ... + + +class SchemaProviderExportable(Protocol): + """Type hint for object that has __datafusion_schema_provider__ PyCapsule. + + https://docs.rs/datafusion/latest/datafusion/catalog/trait.SchemaProvider.html + """ + + def __datafusion_schema_provider__(self) -> object: ... diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 5b99b0d26..bce51d644 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -29,7 +29,7 @@ except ImportError: from typing_extensions import deprecated # Python 3.12 -from datafusion.catalog import Catalog, Table +from datafusion.catalog import Catalog, CatalogProvider, Table from datafusion.dataframe import DataFrame from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list from datafusion.record_batch import RecordBatchStream @@ -80,6 +80,15 @@ class TableProviderExportable(Protocol): def __datafusion_table_provider__(self) -> object: ... # noqa: D105 +class CatalogProviderExportable(Protocol): + """Type hint for object that has __datafusion_catalog_provider__ PyCapsule. + + https://docs.rs/datafusion/latest/datafusion/catalog/trait.CatalogProvider.html + """ + + def __datafusion_catalog_provider__(self) -> object: ... 
# noqa: D105 + + class SessionConfig: """Session configuration options.""" @@ -749,6 +758,19 @@ def deregister_table(self, name: str) -> None: """Remove a table from the session.""" self.ctx.deregister_table(name) + def catalog_names(self) -> set[str]: + """Returns the list of catalogs in this context.""" + return self.ctx.catalog_names() + + def register_catalog_provider( + self, name: str, provider: CatalogProviderExportable | CatalogProvider | Catalog + ) -> None: + """Register a catalog provider.""" + if isinstance(provider, Catalog): + self.ctx.register_catalog_provider(name, provider.catalog) + else: + self.ctx.register_catalog_provider(name, provider) + def register_table_provider( self, name: str, provider: TableProviderExportable ) -> None: diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 991e6875a..61cb09438 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -760,19 +760,16 @@ def join_on( exprs = [expr.expr for expr in on_exprs] return DataFrame(self.df.join_on(right.df, exprs, how)) - def explain(self, verbose: bool = False, analyze: bool = False) -> DataFrame: - """Return a DataFrame with the explanation of its plan so far. + def explain(self, verbose: bool = False, analyze: bool = False) -> None: + """Print an explanation of the DataFrame's plan so far. If ``analyze`` is specified, runs the plan and reports metrics. Args: verbose: If ``True``, more details will be included. analyze: If ``Tru`e``, the plan will run and metrics reported. - - Returns: - DataFrame with the explanation of its plan. """ - return DataFrame(self.df.explain(verbose, analyze)) + self.df.explain(verbose, analyze) def logical_plan(self) -> LogicalPlan: """Return the unoptimized ``LogicalPlan``. 
diff --git a/python/tests/test_catalog.py b/python/tests/test_catalog.py index 23b328458..1f9ecbfc3 100644 --- a/python/tests/test_catalog.py +++ b/python/tests/test_catalog.py @@ -14,9 +14,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations +import datafusion as dfn import pyarrow as pa +import pyarrow.dataset as ds import pytest +from datafusion import SessionContext, Table # Note we take in `database` as a variable even though we don't use @@ -27,9 +31,9 @@ def test_basic(ctx, database): ctx.catalog("non-existent") default = ctx.catalog() - assert default.names() == ["public"] + assert default.names() == {"public"} - for db in [default.database("public"), default.database()]: + for db in [default.schema("public"), default.schema()]: assert db.names() == {"csv1", "csv", "csv2"} table = db.table("csv") @@ -41,3 +45,168 @@ def test_basic(ctx, database): pa.field("float", pa.float64(), nullable=True), ] ) + + +def create_dataset() -> Table: + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + dataset = ds.dataset([batch]) + return Table.from_dataset(dataset) + + +class CustomSchemaProvider(dfn.catalog.SchemaProvider): + def __init__(self): + self.tables = {"table1": create_dataset()} + + def table_names(self) -> set[str]: + return set(self.tables.keys()) + + def register_table(self, name: str, table: Table): + self.tables[name] = table + + def deregister_table(self, name, cascade: bool = True): + del self.tables[name] + + def table(self, name: str) -> Table | None: + return self.tables[name] + + def table_exist(self, name: str) -> bool: + return name in self.tables + + +class CustomCatalogProvider(dfn.catalog.CatalogProvider): + def __init__(self): + self.schemas = {"my_schema": CustomSchemaProvider()} + + def schema_names(self) -> set[str]: + return set(self.schemas.keys()) + + def 
schema(self, name: str): + return self.schemas[name] + + def register_schema(self, name: str, schema: dfn.catalog.Schema): + self.schemas[name] = schema + + def deregister_schema(self, name, cascade: bool): + del self.schemas[name] + + +def test_python_catalog_provider(ctx: SessionContext): + ctx.register_catalog_provider("my_catalog", CustomCatalogProvider()) + + # Check the default catalog provider + assert ctx.catalog("datafusion").names() == {"public"} + + my_catalog = ctx.catalog("my_catalog") + assert my_catalog.names() == {"my_schema"} + + my_catalog.register_schema("second_schema", CustomSchemaProvider()) + assert my_catalog.schema_names() == {"my_schema", "second_schema"} + + my_catalog.deregister_schema("my_schema") + assert my_catalog.schema_names() == {"second_schema"} + + +def test_in_memory_providers(ctx: SessionContext): + catalog = dfn.catalog.Catalog.memory_catalog() + ctx.register_catalog_provider("in_mem_catalog", catalog) + + assert ctx.catalog_names() == {"datafusion", "in_mem_catalog"} + + schema = dfn.catalog.Schema.memory_schema() + catalog.register_schema("in_mem_schema", schema) + + schema.register_table("my_table", create_dataset()) + + batches = ctx.sql("select * from in_mem_catalog.in_mem_schema.my_table").collect() + + assert len(batches) == 1 + assert batches[0].column(0) == pa.array([1, 2, 3]) + assert batches[0].column(1) == pa.array([4, 5, 6]) + + +def test_python_schema_provider(ctx: SessionContext): + catalog = ctx.catalog() + + catalog.deregister_schema("public") + + catalog.register_schema("test_schema1", CustomSchemaProvider()) + assert catalog.names() == {"test_schema1"} + + catalog.register_schema("test_schema2", CustomSchemaProvider()) + catalog.deregister_schema("test_schema1") + assert catalog.names() == {"test_schema2"} + + +def test_python_table_provider(ctx: SessionContext): + catalog = ctx.catalog() + + catalog.register_schema("custom_schema", CustomSchemaProvider()) + schema = catalog.schema("custom_schema") + + 
assert schema.table_names() == {"table1"} + + schema.deregister_table("table1") + schema.register_table("table2", create_dataset()) + assert schema.table_names() == {"table2"} + + # Use the default schema instead of our custom schema + + schema = catalog.schema() + + schema.register_table("table3", create_dataset()) + assert schema.table_names() == {"table3"} + + schema.deregister_table("table3") + schema.register_table("table4", create_dataset()) + assert schema.table_names() == {"table4"} + + +def test_in_end_to_end_python_providers(ctx: SessionContext): + """Test registering all python providers and running a query against them.""" + + all_catalog_names = [ + "datafusion", + "custom_catalog", + "in_mem_catalog", + ] + + all_schema_names = [ + "custom_schema", + "in_mem_schema", + ] + + ctx.register_catalog_provider(all_catalog_names[1], CustomCatalogProvider()) + ctx.register_catalog_provider( + all_catalog_names[2], dfn.catalog.Catalog.memory_catalog() + ) + + for catalog_name in all_catalog_names: + catalog = ctx.catalog(catalog_name) + + # Clean out previous schemas if they exist so we can start clean + for schema_name in catalog.schema_names(): + catalog.deregister_schema(schema_name, cascade=False) + + catalog.register_schema(all_schema_names[0], CustomSchemaProvider()) + catalog.register_schema(all_schema_names[1], dfn.catalog.Schema.memory_schema()) + + for schema_name in all_schema_names: + schema = catalog.schema(schema_name) + + for table_name in schema.table_names(): + schema.deregister_table(table_name) + + schema.register_table("test_table", create_dataset()) + + for catalog_name in all_catalog_names: + for schema_name in all_schema_names: + table_full_name = f"{catalog_name}.{schema_name}.test_table" + + batches = ctx.sql(f"select * from {table_full_name}").collect() + + assert len(batches) == 1 + assert batches[0].column(0) == pa.array([1, 2, 3]) + assert batches[0].column(1) == pa.array([4, 5, 6]) diff --git a/python/tests/test_context.py 
b/python/tests/test_context.py index 4a15ac9cf..6dbcc0d5e 100644 --- a/python/tests/test_context.py +++ b/python/tests/test_context.py @@ -57,7 +57,7 @@ def test_runtime_configs(tmp_path, path_to_str): ctx = SessionContext(config, runtime) assert ctx is not None - db = ctx.catalog("foo").database("bar") + db = ctx.catalog("foo").schema("bar") assert db is not None @@ -70,7 +70,7 @@ def test_temporary_files(tmp_path, path_to_str): ctx = SessionContext(config, runtime) assert ctx is not None - db = ctx.catalog("foo").database("bar") + db = ctx.catalog("foo").schema("bar") assert db is not None @@ -91,7 +91,7 @@ def test_create_context_with_all_valid_args(): ctx = SessionContext(config, runtime) # verify that at least some of the arguments worked - ctx.catalog("foo").database("bar") + ctx.catalog("foo").schema("bar") with pytest.raises(KeyError): ctx.catalog("datafusion") @@ -105,7 +105,7 @@ def test_register_record_batches(ctx): ctx.register_record_batches("t", [[batch]]) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} result = ctx.sql("SELECT a+b, a-b FROM t").collect() @@ -121,7 +121,7 @@ def test_create_dataframe_registers_unique_table_name(ctx): ) df = ctx.create_dataframe([[batch]]) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -141,7 +141,7 @@ def test_create_dataframe_registers_with_defined_table_name(ctx): ) df = ctx.create_dataframe([[batch]], name="tbl") - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -155,7 +155,7 @@ def test_from_arrow_table(ctx): # convert to DataFrame df = ctx.from_arrow(table) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -200,7 +200,7 @@ def test_from_arrow_table_with_name(ctx): # convert to DataFrame with optional name df 
= ctx.from_arrow(table, name="tbl") - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert tables[0] == "tbl" @@ -213,7 +213,7 @@ def test_from_arrow_table_empty(ctx): # convert to DataFrame df = ctx.from_arrow(table) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -228,7 +228,7 @@ def test_from_arrow_table_empty_no_schema(ctx): # convert to DataFrame df = ctx.from_arrow(table) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -246,7 +246,7 @@ def test_from_pylist(ctx): ] df = ctx.from_pylist(data) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -260,7 +260,7 @@ def test_from_pydict(ctx): data = {"a": [1, 2, 3], "b": [4, 5, 6]} df = ctx.from_pydict(data) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -276,7 +276,7 @@ def test_from_pandas(ctx): pandas_df = pd.DataFrame(data) df = ctx.from_pandas(pandas_df) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -292,7 +292,7 @@ def test_from_polars(ctx): polars_df = pd.DataFrame(data) df = ctx.from_polars(polars_df) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -303,7 +303,7 @@ def test_from_polars(ctx): def test_register_table(ctx, database): default = ctx.catalog() - public = default.database("public") + public = default.schema("public") assert public.names() == {"csv", "csv1", "csv2"} table = public.table("csv") @@ -313,7 +313,7 @@ def test_register_table(ctx, database): def test_read_table(ctx, database): default = ctx.catalog() - public = 
default.database("public") + public = default.schema("public") assert public.names() == {"csv", "csv1", "csv2"} table = public.table("csv") @@ -323,7 +323,7 @@ def test_read_table(ctx, database): def test_deregister_table(ctx, database): default = ctx.catalog() - public = default.database("public") + public = default.schema("public") assert public.names() == {"csv", "csv1", "csv2"} ctx.deregister_table("csv") @@ -339,7 +339,7 @@ def test_register_dataset(ctx): dataset = ds.dataset([batch]) ctx.register_dataset("t", dataset) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} result = ctx.sql("SELECT a+b, a-b FROM t").collect() @@ -356,7 +356,7 @@ def test_dataset_filter(ctx, capfd): dataset = ds.dataset([batch]) ctx.register_dataset("t", dataset) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} df = ctx.sql("SELECT a+b, a-b FROM t WHERE a BETWEEN 2 and 3 AND b > 5") # Make sure the filter was pushed down in Physical Plan @@ -455,7 +455,7 @@ def test_dataset_filter_nested_data(ctx): dataset = ds.dataset([batch]) ctx.register_dataset("t", dataset) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} df = ctx.table("t") diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py index 41cee4ef3..c383edc60 100644 --- a/python/tests/test_sql.py +++ b/python/tests/test_sql.py @@ -75,7 +75,7 @@ def test_register_csv(ctx, tmp_path): ) ctx.register_csv("csv3", path, schema=alternative_schema) - assert ctx.catalog().database().names() == { + assert ctx.catalog().schema().names() == { "csv", "csv1", "csv2", @@ -150,7 +150,7 @@ def test_register_parquet(ctx, tmp_path): path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) ctx.register_parquet("t", path) ctx.register_parquet("t1", str(path)) - assert ctx.catalog().database().names() == {"t", "t1"} + assert ctx.catalog().schema().names() == {"t", "t1"} result = 
ctx.sql("SELECT COUNT(a) AS cnt FROM t").collect() result = pa.Table.from_batches(result) @@ -188,7 +188,7 @@ def test_register_parquet_partitioned(ctx, tmp_path, path_to_str, legacy_data_ty parquet_pruning=True, file_extension=".parquet", ) - assert ctx.catalog().database().names() == {"datapp"} + assert ctx.catalog().schema().names() == {"datapp"} result = ctx.sql("SELECT grp, COUNT(*) AS cnt FROM datapp GROUP BY grp").collect() result = pa.Table.from_batches(result) @@ -204,7 +204,7 @@ def test_register_dataset(ctx, tmp_path, path_to_str): dataset = ds.dataset(path, format="parquet") ctx.register_dataset("t", dataset) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} result = ctx.sql("SELECT COUNT(a) AS cnt FROM t").collect() result = pa.Table.from_batches(result) @@ -251,7 +251,7 @@ def test_register_json(ctx, tmp_path): ) ctx.register_json("json3", path, schema=alternative_schema) - assert ctx.catalog().database().names() == { + assert ctx.catalog().schema().names() == { "json", "json1", "json2", @@ -308,7 +308,7 @@ def test_execute(ctx, tmp_path): path = helpers.write_parquet(tmp_path / "a.parquet", pa.array(data)) ctx.register_parquet("t", path) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} # count result = ctx.sql("SELECT COUNT(a) AS cnt FROM t WHERE a IS NOT NULL").collect() @@ -451,18 +451,10 @@ def test_udf( id="datetime_ns", ), # Not writtable to parquet - pytest.param( - helpers.data_timedelta("s"), id="timedelta_s", marks=pytest.mark.xfail - ), - pytest.param( - helpers.data_timedelta("ms"), id="timedelta_ms", marks=pytest.mark.xfail - ), - pytest.param( - helpers.data_timedelta("us"), id="timedelta_us", marks=pytest.mark.xfail - ), - pytest.param( - helpers.data_timedelta("ns"), id="timedelta_ns", marks=pytest.mark.xfail - ), + pytest.param(helpers.data_timedelta("s"), id="timedelta_s"), + pytest.param(helpers.data_timedelta("ms"), 
id="timedelta_ms"), + pytest.param(helpers.data_timedelta("us"), id="timedelta_us"), + pytest.param(helpers.data_timedelta("ns"), id="timedelta_ns"), ], ) def test_simple_select(ctx, tmp_path, arr): @@ -524,7 +516,7 @@ def test_register_listing_table( schema=table.schema if pass_schema else None, file_sort_order=file_sort_order, ) - assert ctx.catalog().database().names() == {"my_table"} + assert ctx.catalog().schema().names() == {"my_table"} result = ctx.sql( "SELECT grp, COUNT(*) AS count FROM my_table GROUP BY grp" diff --git a/python/tests/test_substrait.py b/python/tests/test_substrait.py index f367a447d..43aa327d4 100644 --- a/python/tests/test_substrait.py +++ b/python/tests/test_substrait.py @@ -34,7 +34,7 @@ def test_substrait_serialization(ctx): ctx.register_record_batches("t", [[batch]]) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} # For now just make sure the method calls blow up substrait_plan = ss.Serde.serialize_to_plan("SELECT * FROM t", ctx) @@ -59,7 +59,7 @@ def test_substrait_file_serialization(ctx, tmp_path, path_to_str): ctx.register_record_batches("t", [[batch]]) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} path = tmp_path / "substrait_plan" path = str(path) if path_to_str else path diff --git a/src/catalog.rs b/src/catalog.rs index 83f8d08cb..17d4ec3b8 100644 --- a/src/catalog.rs +++ b/src/catalog.rs @@ -15,44 +15,54 @@ // specific language governing permissions and limitations // under the License. 
-use std::collections::HashSet; -use std::sync::Arc; - -use pyo3::exceptions::PyKeyError; -use pyo3::prelude::*; - -use crate::errors::{PyDataFusionError, PyDataFusionResult}; -use crate::utils::wait_for_future; +use crate::dataset::Dataset; +use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionError, PyDataFusionResult}; +use crate::utils::{validate_pycapsule, wait_for_future}; +use async_trait::async_trait; +use datafusion::catalog::{MemoryCatalogProvider, MemorySchemaProvider}; +use datafusion::common::DataFusionError; use datafusion::{ arrow::pyarrow::ToPyArrow, catalog::{CatalogProvider, SchemaProvider}, datasource::{TableProvider, TableType}, }; +use datafusion_ffi::schema_provider::{FFI_SchemaProvider, ForeignSchemaProvider}; +use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider}; +use pyo3::exceptions::PyKeyError; +use pyo3::prelude::*; +use pyo3::types::PyCapsule; +use pyo3::IntoPyObjectExt; +use std::any::Any; +use std::collections::HashSet; +use std::sync::Arc; -#[pyclass(name = "Catalog", module = "datafusion", subclass)] +#[pyclass(name = "RawCatalog", module = "datafusion.catalog", subclass)] +#[derive(Clone)] pub struct PyCatalog { pub catalog: Arc, } -#[pyclass(name = "Database", module = "datafusion", subclass)] -pub struct PyDatabase { - pub database: Arc, +#[pyclass(name = "RawSchema", module = "datafusion.catalog", subclass)] +#[derive(Clone)] +pub struct PySchema { + pub schema: Arc, } -#[pyclass(name = "Table", module = "datafusion", subclass)] +#[pyclass(name = "RawTable", module = "datafusion.catalog", subclass)] +#[derive(Clone)] pub struct PyTable { pub table: Arc, } -impl PyCatalog { - pub fn new(catalog: Arc) -> Self { +impl From> for PyCatalog { + fn from(catalog: Arc) -> Self { Self { catalog } } } -impl PyDatabase { - pub fn new(database: Arc) -> Self { - Self { database } +impl From> for PySchema { + fn from(schema: Arc) -> Self { + Self { schema } } } @@ -68,36 +78,109 @@ impl PyTable { 
#[pymethods] impl PyCatalog { - fn names(&self) -> Vec { - self.catalog.schema_names() + #[new] + fn new(catalog: PyObject) -> Self { + let catalog_provider = + Arc::new(RustWrappedPyCatalogProvider::new(catalog)) as Arc; + catalog_provider.into() + } + + #[staticmethod] + fn memory_catalog() -> Self { + let catalog_provider = + Arc::new(MemoryCatalogProvider::default()) as Arc; + catalog_provider.into() + } + + fn schema_names(&self) -> HashSet { + self.catalog.schema_names().into_iter().collect() } #[pyo3(signature = (name="public"))] - fn database(&self, name: &str) -> PyResult { - match self.catalog.schema(name) { - Some(database) => Ok(PyDatabase::new(database)), - None => Err(PyKeyError::new_err(format!( - "Database with name {name} doesn't exist." - ))), - } + fn schema(&self, name: &str) -> PyResult { + let schema = self + .catalog + .schema(name) + .ok_or(PyKeyError::new_err(format!( + "Schema with name {name} doesn't exist." + )))?; + + Python::with_gil(|py| { + match schema + .as_any() + .downcast_ref::() + { + Some(wrapped_schema) => Ok(wrapped_schema.schema_provider.clone_ref(py)), + None => PySchema::from(schema).into_py_any(py), + } + }) + } + + fn register_schema(&self, name: &str, schema_provider: Bound<'_, PyAny>) -> PyResult<()> { + let provider = if schema_provider.hasattr("__datafusion_schema_provider__")? { + let capsule = schema_provider + .getattr("__datafusion_schema_provider__")? 
+ .call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_schema_provider")?; + + let provider = unsafe { capsule.reference::() }; + let provider: ForeignSchemaProvider = provider.into(); + Arc::new(provider) as Arc + } else { + match schema_provider.extract::() { + Ok(py_schema) => py_schema.schema, + Err(_) => Arc::new(RustWrappedPySchemaProvider::new(schema_provider.into())) + as Arc, + } + }; + + let _ = self + .catalog + .register_schema(name, provider) + .map_err(py_datafusion_err)?; + + Ok(()) + } + + fn deregister_schema(&self, name: &str, cascade: bool) -> PyResult<()> { + let _ = self + .catalog + .deregister_schema(name, cascade) + .map_err(py_datafusion_err)?; + + Ok(()) } fn __repr__(&self) -> PyResult { - Ok(format!( - "Catalog(schema_names=[{}])", - self.names().join(";") - )) + let mut names: Vec = self.schema_names().into_iter().collect(); + names.sort(); + Ok(format!("Catalog(schema_names=[{}])", names.join(", "))) } } #[pymethods] -impl PyDatabase { - fn names(&self) -> HashSet { - self.database.table_names().into_iter().collect() +impl PySchema { + #[new] + fn new(schema_provider: PyObject) -> Self { + let schema_provider = + Arc::new(RustWrappedPySchemaProvider::new(schema_provider)) as Arc; + schema_provider.into() + } + + #[staticmethod] + fn memory_schema() -> Self { + let schema_provider = Arc::new(MemorySchemaProvider::default()) as Arc; + schema_provider.into() + } + + #[getter] + fn table_names(&self) -> HashSet { + self.schema.table_names().into_iter().collect() } fn table(&self, name: &str, py: Python) -> PyDataFusionResult { - if let Some(table) = wait_for_future(py, self.database.table(name))?? { + if let Some(table) = wait_for_future(py, self.schema.table(name))?? 
{ Ok(PyTable::new(table)) } else { Err(PyDataFusionError::Common(format!( @@ -107,14 +190,49 @@ impl PyDatabase { } fn __repr__(&self) -> PyResult { - Ok(format!( - "Database(table_names=[{}])", - Vec::from_iter(self.names()).join(";") - )) + let mut names: Vec = self.table_names().into_iter().collect(); + names.sort(); + Ok(format!("Schema(table_names=[{}])", names.join(";"))) } - // register_table - // deregister_table + fn register_table(&self, name: &str, table_provider: Bound<'_, PyAny>) -> PyResult<()> { + let provider = if table_provider.hasattr("__datafusion_table_provider__")? { + let capsule = table_provider + .getattr("__datafusion_table_provider__")? + .call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_table_provider")?; + + let provider = unsafe { capsule.reference::() }; + let provider: ForeignTableProvider = provider.into(); + Arc::new(provider) as Arc + } else { + match table_provider.extract::() { + Ok(py_table) => py_table.table, + Err(_) => { + let py = table_provider.py(); + let provider = Dataset::new(&table_provider, py)?; + Arc::new(provider) as Arc + } + } + }; + + let _ = self + .schema + .register_table(name.to_string(), provider) + .map_err(py_datafusion_err)?; + + Ok(()) + } + + fn deregister_table(&self, name: &str) -> PyResult<()> { + let _ = self + .schema + .deregister_table(name) + .map_err(py_datafusion_err)?; + + Ok(()) + } } #[pymethods] @@ -125,6 +243,14 @@ impl PyTable { self.table.schema().to_pyarrow(py) } + #[staticmethod] + fn from_dataset(py: Python<'_>, dataset: &Bound<'_, PyAny>) -> PyResult { + let ds = Arc::new(Dataset::new(dataset, py).map_err(py_datafusion_err)?) + as Arc; + + Ok(Self::new(ds)) + } + /// Get the type of this table for metadata/catalog purposes. 
#[getter] fn kind(&self) -> &str { @@ -145,3 +271,285 @@ impl PyTable { // fn has_exact_statistics // fn supports_filter_pushdown } + +#[derive(Debug)] +pub(crate) struct RustWrappedPySchemaProvider { + schema_provider: PyObject, + owner_name: Option, +} + +impl RustWrappedPySchemaProvider { + pub fn new(schema_provider: PyObject) -> Self { + let owner_name = Python::with_gil(|py| { + schema_provider + .bind(py) + .getattr("owner_name") + .ok() + .map(|name| name.to_string()) + }); + + Self { + schema_provider, + owner_name, + } + } + + fn table_inner(&self, name: &str) -> PyResult>> { + Python::with_gil(|py| { + let provider = self.schema_provider.bind(py); + let py_table_method = provider.getattr("table")?; + + let py_table = py_table_method.call((name,), None)?; + if py_table.is_none() { + return Ok(None); + } + + if py_table.hasattr("__datafusion_table_provider__")? { + let capsule = provider.getattr("__datafusion_table_provider__")?.call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_table_provider")?; + + let provider = unsafe { capsule.reference::() }; + let provider: ForeignTableProvider = provider.into(); + + Ok(Some(Arc::new(provider) as Arc)) + } else { + if let Ok(inner_table) = py_table.getattr("table") { + if let Ok(inner_table) = inner_table.extract::() { + return Ok(Some(inner_table.table)); + } + } + + match py_table.extract::() { + Ok(py_table) => Ok(Some(py_table.table)), + Err(_) => { + let ds = Dataset::new(&py_table, py).map_err(py_datafusion_err)?; + Ok(Some(Arc::new(ds) as Arc)) + } + } + } + }) + } +} + +#[async_trait] +impl SchemaProvider for RustWrappedPySchemaProvider { + fn owner_name(&self) -> Option<&str> { + self.owner_name.as_deref() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + Python::with_gil(|py| { + let provider = self.schema_provider.bind(py); + + provider + .getattr("table_names") + .and_then(|names| names.extract::>()) 
+ .unwrap_or_else(|err| { + log::error!("Unable to get table_names: {err}"); + Vec::default() + }) + }) + } + + async fn table( + &self, + name: &str, + ) -> datafusion::common::Result>, DataFusionError> { + self.table_inner(name).map_err(to_datafusion_err) + } + + fn register_table( + &self, + name: String, + table: Arc, + ) -> datafusion::common::Result>> { + let py_table = PyTable::new(table); + Python::with_gil(|py| { + let provider = self.schema_provider.bind(py); + let _ = provider + .call_method1("register_table", (name, py_table)) + .map_err(to_datafusion_err)?; + // Since the definition of `register_table` says that an error + // will be returned if the table already exists, there is no + // case where we want to return a table provider as output. + Ok(None) + }) + } + + fn deregister_table( + &self, + name: &str, + ) -> datafusion::common::Result>> { + Python::with_gil(|py| { + let provider = self.schema_provider.bind(py); + let table = provider + .call_method1("deregister_table", (name,)) + .map_err(to_datafusion_err)?; + if table.is_none() { + return Ok(None); + } + + // If we can turn this table provider into a `Dataset`, return it. + // Otherwise, return None. 
+ let dataset = match Dataset::new(&table, py) { + Ok(dataset) => Some(Arc::new(dataset) as Arc), + Err(_) => None, + }; + + Ok(dataset) + }) + } + + fn table_exist(&self, name: &str) -> bool { + Python::with_gil(|py| { + let provider = self.schema_provider.bind(py); + provider + .call_method1("table_exist", (name,)) + .and_then(|pyobj| pyobj.extract()) + .unwrap_or(false) + }) + } +} + +#[derive(Debug)] +pub(crate) struct RustWrappedPyCatalogProvider { + pub(crate) catalog_provider: PyObject, +} + +impl RustWrappedPyCatalogProvider { + pub fn new(catalog_provider: PyObject) -> Self { + Self { catalog_provider } + } + + fn schema_inner(&self, name: &str) -> PyResult>> { + Python::with_gil(|py| { + let provider = self.catalog_provider.bind(py); + + let py_schema = provider.call_method1("schema", (name,))?; + if py_schema.is_none() { + return Ok(None); + } + + if py_schema.hasattr("__datafusion_schema_provider__")? { + let capsule = provider + .getattr("__datafusion_schema_provider__")? + .call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_schema_provider")?; + + let provider = unsafe { capsule.reference::() }; + let provider: ForeignSchemaProvider = provider.into(); + + Ok(Some(Arc::new(provider) as Arc)) + } else { + if let Ok(inner_schema) = py_schema.getattr("schema") { + if let Ok(inner_schema) = inner_schema.extract::() { + return Ok(Some(inner_schema.schema)); + } + } + match py_schema.extract::() { + Ok(inner_schema) => Ok(Some(inner_schema.schema)), + Err(_) => { + let py_schema = RustWrappedPySchemaProvider::new(py_schema.into()); + + Ok(Some(Arc::new(py_schema) as Arc)) + } + } + } + }) + } +} + +#[async_trait] +impl CatalogProvider for RustWrappedPyCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + Python::with_gil(|py| { + let provider = self.catalog_provider.bind(py); + provider + .getattr("schema_names") + .and_then(|names| 
names.extract::>()) + .unwrap_or_else(|err| { + log::error!("Unable to get schema_names: {err}"); + Vec::default() + }) + }) + } + + fn schema(&self, name: &str) -> Option> { + self.schema_inner(name).unwrap_or_else(|err| { + log::error!("CatalogProvider schema returned error: {err}"); + None + }) + } + + fn register_schema( + &self, + name: &str, + schema: Arc, + ) -> datafusion::common::Result>> { + // JRIGHT HERE + // let py_schema: PySchema = schema.into(); + Python::with_gil(|py| { + let py_schema = match schema + .as_any() + .downcast_ref::() + { + Some(wrapped_schema) => wrapped_schema.schema_provider.as_any(), + None => &PySchema::from(schema) + .into_py_any(py) + .map_err(to_datafusion_err)?, + }; + + let provider = self.catalog_provider.bind(py); + let schema = provider + .call_method1("register_schema", (name, py_schema)) + .map_err(to_datafusion_err)?; + if schema.is_none() { + return Ok(None); + } + + let schema = Arc::new(RustWrappedPySchemaProvider::new(schema.into())) + as Arc; + + Ok(Some(schema)) + }) + } + + fn deregister_schema( + &self, + name: &str, + cascade: bool, + ) -> datafusion::common::Result>> { + Python::with_gil(|py| { + let provider = self.catalog_provider.bind(py); + let schema = provider + .call_method1("deregister_schema", (name, cascade)) + .map_err(to_datafusion_err)?; + if schema.is_none() { + return Ok(None); + } + + let schema = Arc::new(RustWrappedPySchemaProvider::new(schema.into())) + as Arc; + + Ok(Some(schema)) + }) + } +} + +pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + + Ok(()) +} diff --git a/src/common/data_type.rs b/src/common/data_type.rs index f5f8a6b06..5cf9d6e9f 100644 --- a/src/common/data_type.rs +++ b/src/common/data_type.rs @@ -172,7 +172,7 @@ impl DataTypeMap { SqlType::DATE, )), DataType::Duration(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), 
DataType::Interval(interval_unit) => Ok(DataTypeMap::new( DataType::Interval(*interval_unit), @@ -189,7 +189,7 @@ impl DataTypeMap { SqlType::BINARY, )), DataType::FixedSizeBinary(_) => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", arrow_type)), + DataFusionError::NotImplemented(format!("{arrow_type:?}")), )), DataType::LargeBinary => Ok(DataTypeMap::new( DataType::LargeBinary, @@ -207,23 +207,22 @@ impl DataTypeMap { SqlType::VARCHAR, )), DataType::List(_) => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - arrow_type + "{arrow_type:?}" )))), DataType::FixedSizeList(_, _) => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", arrow_type)), + DataFusionError::NotImplemented(format!("{arrow_type:?}")), )), DataType::LargeList(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::Struct(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::Union(_, _) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::Dictionary(_, _) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::Decimal128(precision, scale) => Ok(DataTypeMap::new( DataType::Decimal128(*precision, *scale), @@ -236,23 +235,22 @@ impl DataTypeMap { SqlType::DECIMAL, )), DataType::Map(_, _) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::RunEndEncoded(_, _) => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", arrow_type)), + DataFusionError::NotImplemented(format!("{arrow_type:?}")), )), DataType::BinaryView => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + 
format!("{arrow_type:?}"), ))), DataType::Utf8View => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - arrow_type + "{arrow_type:?}" )))), DataType::ListView(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::LargeListView(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), } } @@ -379,8 +377,7 @@ impl DataTypeMap { "double" => Ok(DataType::Float64), "byte_array" => Ok(DataType::Utf8), _ => Err(PyValueError::new_err(format!( - "Unable to determine Arrow Data Type from Parquet String type: {:?}", - parquet_str_type + "Unable to determine Arrow Data Type from Parquet String type: {parquet_str_type:?}" ))), }; DataTypeMap::map_from_arrow_type(&arrow_dtype?) @@ -404,12 +401,10 @@ impl DataTypeMap { pub fn py_map_from_sql_type(sql_type: &SqlType) -> PyResult { match sql_type { SqlType::ANY => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::ARRAY => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::BIGINT => Ok(DataTypeMap::new( DataType::Int64, @@ -432,11 +427,10 @@ impl DataTypeMap { SqlType::CHAR, )), SqlType::COLUMN_LIST => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::CURSOR => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::DATE => Ok(DataTypeMap::new( DataType::Date64, @@ -449,8 +443,7 @@ impl DataTypeMap { SqlType::DECIMAL, )), SqlType::DISTINCT => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::DOUBLE => Ok(DataTypeMap::new( DataType::Decimal256(1, 1), @@ -458,7 +451,7 @@ impl DataTypeMap { SqlType::DOUBLE, )), SqlType::DYNAMIC_STAR => 
Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::FLOAT => Ok(DataTypeMap::new( DataType::Decimal128(1, 1), @@ -466,8 +459,7 @@ impl DataTypeMap { SqlType::FLOAT, )), SqlType::GEOMETRY => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::INTEGER => Ok(DataTypeMap::new( DataType::Int8, @@ -475,55 +467,52 @@ impl DataTypeMap { SqlType::INTEGER, )), SqlType::INTERVAL => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::INTERVAL_DAY => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_DAY_HOUR => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_DAY_MINUTE => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::INTERVAL_DAY_SECOND => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::INTERVAL_HOUR => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_HOUR_MINUTE => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::INTERVAL_HOUR_SECOND => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::INTERVAL_MINUTE => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_MINUTE_SECOND => Err(py_datafusion_err( - 
DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::INTERVAL_MONTH => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_SECOND => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_YEAR => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_YEAR_MONTH => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::MAP => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::MULTISET => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::NULL => Ok(DataTypeMap::new( DataType::Null, @@ -531,20 +520,16 @@ impl DataTypeMap { SqlType::NULL, )), SqlType::OTHER => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::REAL => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::ROW => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::SARG => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::SMALLINT => Ok(DataTypeMap::new( DataType::Int16, @@ -552,25 +537,22 @@ impl DataTypeMap { SqlType::SMALLINT, )), SqlType::STRUCTURED => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::SYMBOL => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::TIME => 
Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::TIME_WITH_LOCAL_TIME_ZONE => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::TIMESTAMP => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::TIMESTAMP_WITH_LOCAL_TIME_ZONE => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::TINYINT => Ok(DataTypeMap::new( DataType::Int8, @@ -578,8 +560,7 @@ impl DataTypeMap { SqlType::TINYINT, )), SqlType::UNKNOWN => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::VARBINARY => Ok(DataTypeMap::new( DataType::LargeBinary, @@ -682,8 +663,7 @@ impl PyDataType { "datetime64" => Ok(DataType::Date64), "object" => Ok(DataType::Utf8), _ => Err(PyValueError::new_err(format!( - "Unable to determine Arrow Data Type from Arrow String type: {:?}", - arrow_str_type + "Unable to determine Arrow Data Type from Arrow String type: {arrow_str_type:?}" ))), }; Ok(PyDataType { diff --git a/src/context.rs b/src/context.rs index 6ce1f12bc..36133a33d 100644 --- a/src/context.rs +++ b/src/context.rs @@ -31,7 +31,7 @@ use uuid::Uuid; use pyo3::exceptions::{PyKeyError, PyValueError}; use pyo3::prelude::*; -use crate::catalog::{PyCatalog, PyTable}; +use crate::catalog::{PyCatalog, PyTable, RustWrappedPyCatalogProvider}; use crate::dataframe::PyDataFrame; use crate::dataset::Dataset; use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionResult}; @@ -49,6 +49,7 @@ use crate::utils::{get_global_ctx, get_tokio_runtime, validate_pycapsule, wait_f use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::arrow::pyarrow::PyArrowType; use 
datafusion::arrow::record_batch::RecordBatch; +use datafusion::catalog::CatalogProvider; use datafusion::common::TableReference; use datafusion::common::{exec_err, ScalarValue}; use datafusion::datasource::file_format::file_compression_type::FileCompressionType; @@ -69,8 +70,10 @@ use datafusion::physical_plan::SendableRecordBatchStream; use datafusion::prelude::{ AvroReadOptions, CsvReadOptions, DataFrame, NdJsonReadOptions, ParquetReadOptions, }; +use datafusion_ffi::catalog_provider::{FFI_CatalogProvider, ForeignCatalogProvider}; use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider}; use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType}; +use pyo3::IntoPyObjectExt; use tokio::task::JoinHandle; /// Configuration options for a SessionContext @@ -365,7 +368,7 @@ impl PySessionContext { } else { &upstream_host }; - let url_string = format!("{}{}", scheme, derived_host); + let url_string = format!("{scheme}{derived_host}"); let url = Url::parse(&url_string).unwrap(); self.ctx.runtime_env().register_object_store(&url, store); Ok(()) @@ -614,6 +617,34 @@ impl PySessionContext { Ok(()) } + pub fn register_catalog_provider( + &mut self, + name: &str, + provider: Bound<'_, PyAny>, + ) -> PyDataFusionResult<()> { + let provider = if provider.hasattr("__datafusion_catalog_provider__")? { + let capsule = provider + .getattr("__datafusion_catalog_provider__")? 
+ .call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_catalog_provider")?; + + let provider = unsafe { capsule.reference::() }; + let provider: ForeignCatalogProvider = provider.into(); + Arc::new(provider) as Arc + } else { + match provider.extract::() { + Ok(py_catalog) => py_catalog.catalog, + Err(_) => Arc::new(RustWrappedPyCatalogProvider::new(provider.into())) + as Arc, + } + }; + + let _ = self.ctx.register_catalog(name, provider); + + Ok(()) + } + /// Construct datafusion dataframe from Arrow Table pub fn register_table_provider( &mut self, @@ -845,14 +876,24 @@ impl PySessionContext { } #[pyo3(signature = (name="datafusion"))] - pub fn catalog(&self, name: &str) -> PyResult { - match self.ctx.catalog(name) { - Some(catalog) => Ok(PyCatalog::new(catalog)), - None => Err(PyKeyError::new_err(format!( - "Catalog with name {} doesn't exist.", - &name, - ))), - } + pub fn catalog(&self, name: &str) -> PyResult { + let catalog = self.ctx.catalog(name).ok_or(PyKeyError::new_err(format!( + "Catalog with name {name} doesn't exist." 
+ )))?; + + Python::with_gil(|py| { + match catalog + .as_any() + .downcast_ref::() + { + Some(wrapped_schema) => Ok(wrapped_schema.catalog_provider.clone_ref(py)), + None => PyCatalog::from(catalog).into_py_any(py), + } + }) + } + + pub fn catalog_names(&self) -> HashSet { + self.ctx.catalog_names().into_iter().collect() } pub fn tables(&self) -> HashSet { diff --git a/src/expr.rs b/src/expr.rs index 6b1d01d65..f1e002367 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -171,12 +171,10 @@ impl PyExpr { Expr::Cast(value) => Ok(cast::PyCast::from(value.clone()).into_bound_py_any(py)?), Expr::TryCast(value) => Ok(cast::PyTryCast::from(value.clone()).into_bound_py_any(py)?), Expr::ScalarFunction(value) => Err(py_unsupported_variant_err(format!( - "Converting Expr::ScalarFunction to a Python object is not implemented: {:?}", - value + "Converting Expr::ScalarFunction to a Python object is not implemented: {value:?}" ))), Expr::WindowFunction(value) => Err(py_unsupported_variant_err(format!( - "Converting Expr::WindowFunction to a Python object is not implemented: {:?}", - value + "Converting Expr::WindowFunction to a Python object is not implemented: {value:?}" ))), Expr::InList(value) => Ok(in_list::PyInList::from(value.clone()).into_bound_py_any(py)?), Expr::Exists(value) => Ok(exists::PyExists::from(value.clone()).into_bound_py_any(py)?), @@ -188,8 +186,7 @@ impl PyExpr { } #[allow(deprecated)] Expr::Wildcard { qualifier, options } => Err(py_unsupported_variant_err(format!( - "Converting Expr::Wildcard to a Python object is not implemented : {:?} {:?}", - qualifier, options + "Converting Expr::Wildcard to a Python object is not implemented : {qualifier:?} {options:?}" ))), Expr::GroupingSet(value) => { Ok(grouping_set::PyGroupingSet::from(value.clone()).into_bound_py_any(py)?) @@ -198,8 +195,7 @@ impl PyExpr { Ok(placeholder::PyPlaceholder::from(value.clone()).into_bound_py_any(py)?) 
} Expr::OuterReferenceColumn(data_type, column) => Err(py_unsupported_variant_err(format!( - "Converting Expr::OuterReferenceColumn to a Python object is not implemented: {:?} - {:?}", - data_type, column + "Converting Expr::OuterReferenceColumn to a Python object is not implemented: {data_type:?} - {column:?}" ))), Expr::Unnest(value) => Ok(unnest_expr::PyUnnestExpr::from(value.clone()).into_bound_py_any(py)?), } @@ -755,8 +751,7 @@ impl PyExpr { Expr::Cast(Cast { expr: _, data_type }) => DataTypeMap::map_from_arrow_type(data_type), Expr::Literal(scalar_value, _) => DataTypeMap::map_from_scalar_value(scalar_value), _ => Err(py_type_err(format!( - "Non Expr::Literal encountered in types: {:?}", - expr + "Non Expr::Literal encountered in types: {expr:?}" ))), } } diff --git a/src/expr/aggregate.rs b/src/expr/aggregate.rs index a99d83d23..fd4393271 100644 --- a/src/expr/aggregate.rs +++ b/src/expr/aggregate.rs @@ -116,7 +116,7 @@ impl PyAggregate { } fn __repr__(&self) -> PyResult { - Ok(format!("Aggregate({})", self)) + Ok(format!("Aggregate({self})")) } } diff --git a/src/expr/aggregate_expr.rs b/src/expr/aggregate_expr.rs index c09f116e3..7c5d3d31f 100644 --- a/src/expr/aggregate_expr.rs +++ b/src/expr/aggregate_expr.rs @@ -75,6 +75,6 @@ impl PyAggregateFunction { /// Get a String representation of this column fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } } diff --git a/src/expr/alias.rs b/src/expr/alias.rs index e8e03cfad..40746f200 100644 --- a/src/expr/alias.rs +++ b/src/expr/alias.rs @@ -64,6 +64,6 @@ impl PyAlias { /// Get a String representation of this column fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } } diff --git a/src/expr/analyze.rs b/src/expr/analyze.rs index 62f93cd26..e8081e95b 100644 --- a/src/expr/analyze.rs +++ b/src/expr/analyze.rs @@ -69,7 +69,7 @@ impl PyAnalyze { } fn __repr__(&self) -> PyResult { - Ok(format!("Analyze({})", self)) + Ok(format!("Analyze({self})")) } } diff --git 
a/src/expr/between.rs b/src/expr/between.rs index a2cac1442..817f1baae 100644 --- a/src/expr/between.rs +++ b/src/expr/between.rs @@ -71,6 +71,6 @@ impl PyBetween { } fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } } diff --git a/src/expr/column.rs b/src/expr/column.rs index 365dbc0d2..50f316f1c 100644 --- a/src/expr/column.rs +++ b/src/expr/column.rs @@ -45,7 +45,7 @@ impl PyColumn { /// Get the column relation fn relation(&self) -> Option { - self.col.relation.as_ref().map(|r| format!("{}", r)) + self.col.relation.as_ref().map(|r| format!("{r}")) } /// Get the fully-qualified column name diff --git a/src/expr/copy_to.rs b/src/expr/copy_to.rs index ebfcb8ebc..473dabfed 100644 --- a/src/expr/copy_to.rs +++ b/src/expr/copy_to.rs @@ -106,7 +106,7 @@ impl PyCopyTo { } fn __repr__(&self) -> PyResult { - Ok(format!("CopyTo({})", self)) + Ok(format!("CopyTo({self})")) } fn __name__(&self) -> PyResult { @@ -129,7 +129,7 @@ impl Display for PyFileType { #[pymethods] impl PyFileType { fn __repr__(&self) -> PyResult { - Ok(format!("FileType({})", self)) + Ok(format!("FileType({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_catalog.rs b/src/expr/create_catalog.rs index f4ea0f517..d2d2ee8f6 100644 --- a/src/expr/create_catalog.rs +++ b/src/expr/create_catalog.rs @@ -81,7 +81,7 @@ impl PyCreateCatalog { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateCatalog({})", self)) + Ok(format!("CreateCatalog({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_catalog_schema.rs b/src/expr/create_catalog_schema.rs index 85f447e1e..e794962f5 100644 --- a/src/expr/create_catalog_schema.rs +++ b/src/expr/create_catalog_schema.rs @@ -81,7 +81,7 @@ impl PyCreateCatalogSchema { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateCatalogSchema({})", self)) + Ok(format!("CreateCatalogSchema({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_external_table.rs 
b/src/expr/create_external_table.rs index 01ce7d0ca..3e35af006 100644 --- a/src/expr/create_external_table.rs +++ b/src/expr/create_external_table.rs @@ -164,7 +164,7 @@ impl PyCreateExternalTable { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateExternalTable({})", self)) + Ok(format!("CreateExternalTable({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_function.rs b/src/expr/create_function.rs index 6f3c3f0ff..c02ceebb1 100644 --- a/src/expr/create_function.rs +++ b/src/expr/create_function.rs @@ -163,7 +163,7 @@ impl PyCreateFunction { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateFunction({})", self)) + Ok(format!("CreateFunction({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_index.rs b/src/expr/create_index.rs index 13dadbc3f..0f4b5011a 100644 --- a/src/expr/create_index.rs +++ b/src/expr/create_index.rs @@ -110,7 +110,7 @@ impl PyCreateIndex { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateIndex({})", self)) + Ok(format!("CreateIndex({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_memory_table.rs b/src/expr/create_memory_table.rs index 8872b2d47..37f4d3420 100644 --- a/src/expr/create_memory_table.rs +++ b/src/expr/create_memory_table.rs @@ -78,7 +78,7 @@ impl PyCreateMemoryTable { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateMemoryTable({})", self)) + Ok(format!("CreateMemoryTable({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_view.rs b/src/expr/create_view.rs index 87bb76876..718e404d0 100644 --- a/src/expr/create_view.rs +++ b/src/expr/create_view.rs @@ -75,7 +75,7 @@ impl PyCreateView { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateView({})", self)) + Ok(format!("CreateView({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/describe_table.rs b/src/expr/describe_table.rs index 5658a13f2..6c48f3c77 100644 --- a/src/expr/describe_table.rs +++ b/src/expr/describe_table.rs @@ -61,7 +61,7 @@ 
impl PyDescribeTable { } fn __repr__(&self) -> PyResult { - Ok(format!("DescribeTable({})", self)) + Ok(format!("DescribeTable({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/distinct.rs b/src/expr/distinct.rs index b62b776f8..889e7099d 100644 --- a/src/expr/distinct.rs +++ b/src/expr/distinct.rs @@ -48,8 +48,7 @@ impl Display for PyDistinct { Distinct::All(input) => write!( f, "Distinct ALL - \nInput: {:?}", - input, + \nInput: {input:?}", ), Distinct::On(distinct_on) => { write!( @@ -71,7 +70,7 @@ impl PyDistinct { } fn __repr__(&self) -> PyResult { - Ok(format!("Distinct({})", self)) + Ok(format!("Distinct({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/drop_catalog_schema.rs b/src/expr/drop_catalog_schema.rs index b7420a99c..b4a4c521c 100644 --- a/src/expr/drop_catalog_schema.rs +++ b/src/expr/drop_catalog_schema.rs @@ -101,7 +101,7 @@ impl PyDropCatalogSchema { } fn __repr__(&self) -> PyResult { - Ok(format!("DropCatalogSchema({})", self)) + Ok(format!("DropCatalogSchema({self})")) } } diff --git a/src/expr/drop_function.rs b/src/expr/drop_function.rs index 9fbd78fdc..fca9eb94b 100644 --- a/src/expr/drop_function.rs +++ b/src/expr/drop_function.rs @@ -76,7 +76,7 @@ impl PyDropFunction { } fn __repr__(&self) -> PyResult { - Ok(format!("DropFunction({})", self)) + Ok(format!("DropFunction({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/drop_table.rs b/src/expr/drop_table.rs index 96983c1cf..3f442539a 100644 --- a/src/expr/drop_table.rs +++ b/src/expr/drop_table.rs @@ -70,7 +70,7 @@ impl PyDropTable { } fn __repr__(&self) -> PyResult { - Ok(format!("DropTable({})", self)) + Ok(format!("DropTable({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/drop_view.rs b/src/expr/drop_view.rs index 1d1ab1e59..6196c8bb5 100644 --- a/src/expr/drop_view.rs +++ b/src/expr/drop_view.rs @@ -83,7 +83,7 @@ impl PyDropView { } fn __repr__(&self) -> PyResult { - Ok(format!("DropView({})", self)) + 
Ok(format!("DropView({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/empty_relation.rs b/src/expr/empty_relation.rs index a1534ac15..758213423 100644 --- a/src/expr/empty_relation.rs +++ b/src/expr/empty_relation.rs @@ -65,7 +65,7 @@ impl PyEmptyRelation { /// Get a String representation of this column fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } fn __name__(&self) -> PyResult { diff --git a/src/expr/filter.rs b/src/expr/filter.rs index 9bdb667cd..4fcb600cd 100644 --- a/src/expr/filter.rs +++ b/src/expr/filter.rs @@ -72,7 +72,7 @@ impl PyFilter { } fn __repr__(&self) -> String { - format!("Filter({})", self) + format!("Filter({self})") } } diff --git a/src/expr/join.rs b/src/expr/join.rs index 76ec532e7..b8d1d9da7 100644 --- a/src/expr/join.rs +++ b/src/expr/join.rs @@ -177,7 +177,7 @@ impl PyJoin { } fn __repr__(&self) -> PyResult { - Ok(format!("Join({})", self)) + Ok(format!("Join({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/like.rs b/src/expr/like.rs index 2e1f060bd..f180f5d4c 100644 --- a/src/expr/like.rs +++ b/src/expr/like.rs @@ -75,7 +75,7 @@ impl PyLike { } fn __repr__(&self) -> String { - format!("Like({})", self) + format!("Like({self})") } } @@ -133,7 +133,7 @@ impl PyILike { } fn __repr__(&self) -> String { - format!("Like({})", self) + format!("Like({self})") } } @@ -191,6 +191,6 @@ impl PySimilarTo { } fn __repr__(&self) -> String { - format!("Like({})", self) + format!("Like({self})") } } diff --git a/src/expr/limit.rs b/src/expr/limit.rs index c2a33ff89..92552814e 100644 --- a/src/expr/limit.rs +++ b/src/expr/limit.rs @@ -81,7 +81,7 @@ impl PyLimit { } fn __repr__(&self) -> PyResult { - Ok(format!("Limit({})", self)) + Ok(format!("Limit({self})")) } } diff --git a/src/expr/projection.rs b/src/expr/projection.rs index dc7e5e3c1..b5a9ef34a 100644 --- a/src/expr/projection.rs +++ b/src/expr/projection.rs @@ -85,7 +85,7 @@ impl PyProjection { } fn __repr__(&self) -> PyResult { - 
Ok(format!("Projection({})", self)) + Ok(format!("Projection({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/recursive_query.rs b/src/expr/recursive_query.rs index 65181f7d3..2517b7417 100644 --- a/src/expr/recursive_query.rs +++ b/src/expr/recursive_query.rs @@ -89,7 +89,7 @@ impl PyRecursiveQuery { } fn __repr__(&self) -> PyResult { - Ok(format!("RecursiveQuery({})", self)) + Ok(format!("RecursiveQuery({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/repartition.rs b/src/expr/repartition.rs index 3e782d6af..48b5e7041 100644 --- a/src/expr/repartition.rs +++ b/src/expr/repartition.rs @@ -108,7 +108,7 @@ impl PyRepartition { } fn __repr__(&self) -> PyResult { - Ok(format!("Repartition({})", self)) + Ok(format!("Repartition({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/sort.rs b/src/expr/sort.rs index ed4947591..79a8aee50 100644 --- a/src/expr/sort.rs +++ b/src/expr/sort.rs @@ -87,7 +87,7 @@ impl PySort { } fn __repr__(&self) -> PyResult { - Ok(format!("Sort({})", self)) + Ok(format!("Sort({self})")) } } diff --git a/src/expr/sort_expr.rs b/src/expr/sort_expr.rs index 12f74e4d8..79e35d978 100644 --- a/src/expr/sort_expr.rs +++ b/src/expr/sort_expr.rs @@ -85,6 +85,6 @@ impl PySortExpr { } fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } } diff --git a/src/expr/subquery.rs b/src/expr/subquery.rs index 5ebfe6927..77f56f9a9 100644 --- a/src/expr/subquery.rs +++ b/src/expr/subquery.rs @@ -62,7 +62,7 @@ impl PySubquery { } fn __repr__(&self) -> PyResult { - Ok(format!("Subquery({})", self)) + Ok(format!("Subquery({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/subquery_alias.rs b/src/expr/subquery_alias.rs index 267a4d485..3302e7f23 100644 --- a/src/expr/subquery_alias.rs +++ b/src/expr/subquery_alias.rs @@ -72,7 +72,7 @@ impl PySubqueryAlias { } fn __repr__(&self) -> PyResult { - Ok(format!("SubqueryAlias({})", self)) + Ok(format!("SubqueryAlias({self})")) } fn 
__name__(&self) -> PyResult { diff --git a/src/expr/table_scan.rs b/src/expr/table_scan.rs index 6a0d53f0f..329964687 100644 --- a/src/expr/table_scan.rs +++ b/src/expr/table_scan.rs @@ -136,7 +136,7 @@ impl PyTableScan { } fn __repr__(&self) -> PyResult { - Ok(format!("TableScan({})", self)) + Ok(format!("TableScan({self})")) } } diff --git a/src/expr/union.rs b/src/expr/union.rs index 5a08ccc13..e0b221398 100644 --- a/src/expr/union.rs +++ b/src/expr/union.rs @@ -66,7 +66,7 @@ impl PyUnion { } fn __repr__(&self) -> PyResult { - Ok(format!("Union({})", self)) + Ok(format!("Union({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/unnest.rs b/src/expr/unnest.rs index 8e70e0990..c8833347f 100644 --- a/src/expr/unnest.rs +++ b/src/expr/unnest.rs @@ -66,7 +66,7 @@ impl PyUnnest { } fn __repr__(&self) -> PyResult { - Ok(format!("Unnest({})", self)) + Ok(format!("Unnest({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/unnest_expr.rs b/src/expr/unnest_expr.rs index 2234d24b1..634186ed8 100644 --- a/src/expr/unnest_expr.rs +++ b/src/expr/unnest_expr.rs @@ -58,7 +58,7 @@ impl PyUnnestExpr { } fn __repr__(&self) -> PyResult { - Ok(format!("UnnestExpr({})", self)) + Ok(format!("UnnestExpr({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/window.rs b/src/expr/window.rs index 052d9eeb4..a408731c2 100644 --- a/src/expr/window.rs +++ b/src/expr/window.rs @@ -185,8 +185,7 @@ impl PyWindowFrame { "groups" => WindowFrameUnits::Groups, _ => { return Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - units, + "{units:?}", )))); } }; @@ -197,8 +196,7 @@ impl PyWindowFrame { WindowFrameUnits::Rows => WindowFrameBound::Preceding(ScalarValue::UInt64(None)), WindowFrameUnits::Groups => { return Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - units, + "{units:?}", )))); } }, @@ -210,8 +208,7 @@ impl PyWindowFrame { WindowFrameUnits::Range => 
WindowFrameBound::Following(ScalarValue::UInt64(None)), WindowFrameUnits::Groups => { return Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - units, + "{units:?}", )))); } }, @@ -236,7 +233,7 @@ impl PyWindowFrame { /// Get a String representation of this window frame fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } } diff --git a/src/functions.rs b/src/functions.rs index b2bafcb65..b40500b8b 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -937,7 +937,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(left))?; m.add_wrapped(wrap_pyfunction!(length))?; m.add_wrapped(wrap_pyfunction!(ln))?; - m.add_wrapped(wrap_pyfunction!(log))?; + m.add_wrapped(wrap_pyfunction!(self::log))?; m.add_wrapped(wrap_pyfunction!(log10))?; m.add_wrapped(wrap_pyfunction!(log2))?; m.add_wrapped(wrap_pyfunction!(lower))?; diff --git a/src/lib.rs b/src/lib.rs index 1293eee3c..29d3f41da 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -77,10 +77,10 @@ pub(crate) struct TokioRuntime(tokio::runtime::Runtime); /// datafusion directory. #[pymodule] fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { + // Initialize logging + pyo3_log::init(); + // Register the python classes - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -98,6 +98,10 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; + let catalog = PyModule::new(py, "catalog")?; + catalog::init_module(&catalog)?; + m.add_submodule(&catalog)?; + // Register `common` as a submodule. 
Matching `datafusion-common` https://docs.rs/datafusion-common/latest/datafusion_common/ let common = PyModule::new(py, "common")?; common::init_module(&common)?; diff --git a/src/physical_plan.rs b/src/physical_plan.rs index f0be45c6a..49db643e1 100644 --- a/src/physical_plan.rs +++ b/src/physical_plan.rs @@ -78,8 +78,7 @@ impl PyExecutionPlan { let proto_plan = datafusion_proto::protobuf::PhysicalPlanNode::decode(bytes).map_err(|e| { PyRuntimeError::new_err(format!( - "Unable to decode logical node from serialized bytes: {}", - e + "Unable to decode logical node from serialized bytes: {e}" )) })?; diff --git a/src/sql/logical.rs b/src/sql/logical.rs index 198d68bdc..97d320470 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -201,8 +201,7 @@ impl PyLogicalPlan { let proto_plan = datafusion_proto::protobuf::LogicalPlanNode::decode(bytes).map_err(|e| { PyRuntimeError::new_err(format!( - "Unable to decode logical node from serialized bytes: {}", - e + "Unable to decode logical node from serialized bytes: {e}" )) })?; diff --git a/src/utils.rs b/src/utils.rs index f4e121fd5..3b30de5de 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -109,8 +109,7 @@ pub(crate) fn validate_pycapsule(capsule: &Bound, name: &str) -> PyRe let capsule_name = capsule_name.unwrap().to_str()?; if capsule_name != name { return Err(PyValueError::new_err(format!( - "Expected name '{}' in PyCapsule, instead got '{}'", - name, capsule_name + "Expected name '{name}' in PyCapsule, instead got '{capsule_name}'" ))); } @@ -127,7 +126,7 @@ pub(crate) fn py_obj_to_scalar_value(py: Python, obj: PyObject) -> PyResult Date: Wed, 2 Jul 2025 09:59:02 -0400 Subject: [PATCH 149/248] feat: add FFI support for user defined functions (#1145) * Intermediate work adding ffi scalar udf * Add scalar UDF and example * Add aggregate udf via ffi * Initial commit for window ffi integration * Remove unused import --- docs/source/contributor-guide/ffi.rst | 2 +- examples/datafusion-ffi-example/Cargo.lock | 217 
++++++++++-------- examples/datafusion-ffi-example/Cargo.toml | 8 +- .../python/tests/_test_aggregate_udf.py | 77 +++++++ .../python/tests/_test_scalar_udf.py | 70 ++++++ .../python/tests/_test_window_udf.py | 89 +++++++ .../src/aggregate_udf.rs | 81 +++++++ .../src/catalog_provider.rs | 1 - examples/datafusion-ffi-example/src/lib.rs | 9 + .../datafusion-ffi-example/src/scalar_udf.rs | 91 ++++++++ .../datafusion-ffi-example/src/window_udf.rs | 81 +++++++ python/datafusion/user_defined.py | 107 ++++++++- src/functions.rs | 2 +- src/udaf.rs | 31 ++- src/udf.rs | 25 +- src/udwf.rs | 27 ++- 16 files changed, 805 insertions(+), 113 deletions(-) create mode 100644 examples/datafusion-ffi-example/python/tests/_test_aggregate_udf.py create mode 100644 examples/datafusion-ffi-example/python/tests/_test_scalar_udf.py create mode 100644 examples/datafusion-ffi-example/python/tests/_test_window_udf.py create mode 100644 examples/datafusion-ffi-example/src/aggregate_udf.rs create mode 100644 examples/datafusion-ffi-example/src/scalar_udf.rs create mode 100644 examples/datafusion-ffi-example/src/window_udf.rs diff --git a/docs/source/contributor-guide/ffi.rst b/docs/source/contributor-guide/ffi.rst index c1f9806b3..a40af1234 100644 --- a/docs/source/contributor-guide/ffi.rst +++ b/docs/source/contributor-guide/ffi.rst @@ -176,7 +176,7 @@ By convention the ``datafusion-python`` library expects a Python object that has ``TableProvider`` PyCapsule to have this capsule accessible by calling a function named ``__datafusion_table_provider__``. You can see a complete working example of how to share a ``TableProvider`` from one python library to DataFusion Python in the -`repository examples folder `_. +`repository examples folder `_. This section has been written using ``TableProvider`` as an example. It is the first extension that has been written using this approach and the most thoroughly implemented. 
diff --git a/examples/datafusion-ffi-example/Cargo.lock b/examples/datafusion-ffi-example/Cargo.lock index e5a1ca8d1..1b4ca6bee 100644 --- a/examples/datafusion-ffi-example/Cargo.lock +++ b/examples/datafusion-ffi-example/Cargo.lock @@ -323,6 +323,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73a47aa0c771b5381de2b7f16998d351a6f4eb839f1e13d48353e17e873d969b" dependencies = [ "bitflags", + "serde", + "serde_json", ] [[package]] @@ -748,9 +750,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffe060b978f74ab446be722adb8a274e052e005bf6dfd171caadc3abaad10080" +checksum = "cc6cb8c2c81eada072059983657d6c9caf3fddefc43b4a65551d243253254a96" dependencies = [ "arrow", "arrow-ipc", @@ -775,7 +777,6 @@ dependencies = [ "datafusion-functions-nested", "datafusion-functions-table", "datafusion-functions-window", - "datafusion-macros", "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -790,7 +791,7 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand", + "rand 0.9.1", "regex", "sqlparser", "tempfile", @@ -803,9 +804,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61fe34f401bd03724a1f96d12108144f8cd495a3cdda2bf5e091822fb80b7e66" +checksum = "b7be8d1b627843af62e447396db08fe1372d882c0eb8d0ea655fd1fbc33120ee" dependencies = [ "arrow", "async-trait", @@ -829,9 +830,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4411b8e3bce5e0fc7521e44f201def2e2d5d1b5f176fb56e8cdc9942c890f00" +checksum = "38ab16c5ae43f65ee525fc493ceffbc41f40dee38b01f643dfcfc12959e92038" dependencies = [ "arrow", "async-trait", @@ -852,9 
+853,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0734015d81c8375eb5d4869b7f7ecccc2ee8d6cb81948ef737cd0e7b743bd69c" +checksum = "d3d56b2ac9f476b93ca82e4ef5fb00769c8a3f248d12b4965af7e27635fa7e12" dependencies = [ "ahash", "arrow", @@ -876,9 +877,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5167bb1d2ccbb87c6bc36c295274d7a0519b14afcfdaf401d53cbcaa4ef4968b" +checksum = "16015071202d6133bc84d72756176467e3e46029f3ce9ad2cb788f9b1ff139b2" dependencies = [ "futures", "log", @@ -887,9 +888,9 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04e602dcdf2f50c2abf297cc2203c73531e6f48b29516af7695d338cf2a778b1" +checksum = "b77523c95c89d2a7eb99df14ed31390e04ab29b43ff793e562bdc1716b07e17b" dependencies = [ "arrow", "async-compression", @@ -912,7 +913,7 @@ dependencies = [ "log", "object_store", "parquet", - "rand", + "rand 0.9.1", "tempfile", "tokio", "tokio-util", @@ -923,9 +924,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3bb2253952dc32296ed5b84077cb2e0257fea4be6373e1c376426e17ead4ef6" +checksum = "40d25c5e2c0ebe8434beeea997b8e88d55b3ccc0d19344293f2373f65bc524fc" dependencies = [ "arrow", "async-trait", @@ -948,9 +949,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8c7f47a5d2fe03bfa521ec9bafdb8a5c82de8377f60967c3663f00c8790352" +checksum = 
"3dc6959e1155741ab35369e1dc7673ba30fc45ed568fad34c01b7cb1daeb4d4c" dependencies = [ "arrow", "async-trait", @@ -973,9 +974,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27d15868ea39ed2dc266728b554f6304acd473de2142281ecfa1294bb7415923" +checksum = "b7a6afdfe358d70f4237f60eaef26ae5a1ce7cb2c469d02d5fc6c7fd5d84e58b" dependencies = [ "arrow", "async-trait", @@ -998,21 +999,21 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand", + "rand 0.9.1", "tokio", ] [[package]] name = "datafusion-doc" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a91f8c2c5788ef32f48ff56c68e5b545527b744822a284373ac79bba1ba47292" +checksum = "9bcd8a3e3e3d02ea642541be23d44376b5d5c37c2938cce39b3873cdf7186eea" [[package]] name = "datafusion-execution" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06f004d100f49a3658c9da6fb0c3a9b760062d96cd4ad82ccc3b7b69a9fb2f84" +checksum = "670da1d45d045eee4c2319b8c7ea57b26cf48ab77b630aaa50b779e406da476a" dependencies = [ "arrow", "dashmap", @@ -1022,16 +1023,16 @@ dependencies = [ "log", "object_store", "parking_lot", - "rand", + "rand 0.9.1", "tempfile", "url", ] [[package]] name = "datafusion-expr" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a4e4ce3802609be38eeb607ee72f6fe86c3091460de9dbfae9e18db423b3964" +checksum = "b3a577f64bdb7e2cc4043cd97f8901d8c504711fde2dbcb0887645b00d7c660b" dependencies = [ "arrow", "chrono", @@ -1050,9 +1051,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "422ac9cf3b22bbbae8cdf8ceb33039107fde1b5492693168f13bd566b1bcc839" +checksum = 
"51b7916806ace3e9f41884f230f7f38ebf0e955dfbd88266da1826f29a0b9a6a" dependencies = [ "arrow", "datafusion-common", @@ -1063,9 +1064,9 @@ dependencies = [ [[package]] name = "datafusion-ffi" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cf3fe9ab492c56daeb7beed526690d33622d388b8870472e0b7b7f55490338c" +checksum = "980cca31de37f5dadf7ea18e4ffc2b6833611f45bed5ef9de0831d2abb50f1ef" dependencies = [ "abi_stable", "arrow", @@ -1073,7 +1074,9 @@ dependencies = [ "async-ffi", "async-trait", "datafusion", + "datafusion-functions-aggregate-common", "datafusion-proto", + "datafusion-proto-common", "futures", "log", "prost", @@ -1081,11 +1084,25 @@ dependencies = [ "tokio", ] +[[package]] +name = "datafusion-ffi-example" +version = "0.2.0" +dependencies = [ + "arrow", + "arrow-array", + "arrow-schema", + "async-trait", + "datafusion", + "datafusion-ffi", + "pyo3", + "pyo3-build-config", +] + [[package]] name = "datafusion-functions" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ddf0a0a2db5d2918349c978d42d80926c6aa2459cd8a3c533a84ec4bb63479e" +checksum = "7fb31c9dc73d3e0c365063f91139dc273308f8a8e124adda9898db8085d68357" dependencies = [ "arrow", "arrow-buffer", @@ -1103,7 +1120,7 @@ dependencies = [ "itertools", "log", "md-5", - "rand", + "rand 0.9.1", "regex", "sha2", "unicode-segmentation", @@ -1112,9 +1129,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "408a05dafdc70d05a38a29005b8b15e21b0238734dab1e98483fcb58038c5aba" +checksum = "ebb72c6940697eaaba9bd1f746a697a07819de952b817e3fb841fb75331ad5d4" dependencies = [ "ahash", "arrow", @@ -1133,9 +1150,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "47.0.0" +version = "48.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "756d21da2dd6c9bef97af1504970ff56cbf35d03fbd4ffd62827f02f4d2279d4" +checksum = "d7fdc54656659e5ecd49bf341061f4156ab230052611f4f3609612a0da259696" dependencies = [ "ahash", "arrow", @@ -1146,9 +1163,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d8d50f6334b378930d992d801a10ac5b3e93b846b39e4a05085742572844537" +checksum = "fad94598e3374938ca43bca6b675febe557e7a14eb627d617db427d70d65118b" dependencies = [ "arrow", "arrow-ord", @@ -1167,9 +1184,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9a97220736c8fff1446e936be90d57216c06f28969f9ffd3b72ac93c958c8a" +checksum = "de2fc6c2946da5cab8364fb28b5cac3115f0f3a87960b235ed031c3f7e2e639b" dependencies = [ "arrow", "async-trait", @@ -1183,10 +1200,11 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cefc2d77646e1aadd1d6a9c40088937aedec04e68c5f0465939912e1291f8193" +checksum = "3e5746548a8544870a119f556543adcd88fe0ba6b93723fe78ad0439e0fbb8b4" dependencies = [ + "arrow", "datafusion-common", "datafusion-doc", "datafusion-expr", @@ -1200,9 +1218,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd4aff082c42fa6da99ce0698c85addd5252928c908eb087ca3cfa64ff16b313" +checksum = "dcbe9404382cda257c434f22e13577bee7047031dfdb6216dd5e841b9465e6fe" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1210,9 +1228,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "47.0.0" 
+version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df6f88d7ee27daf8b108ba910f9015176b36fbc72902b1ca5c2a5f1d1717e1a1" +checksum = "8dce50e3b637dab0d25d04d2fe79dfdca2b257eabd76790bffd22c7f90d700c8" dependencies = [ "datafusion-expr", "quote", @@ -1221,9 +1239,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084d9f979c4b155346d3c34b18f4256e6904ded508e9554d90fed416415c3515" +checksum = "03cfaacf06445dc3bbc1e901242d2a44f2cae99a744f49f3fefddcee46240058" dependencies = [ "arrow", "chrono", @@ -1240,9 +1258,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64c536062b0076f4e30084065d805f389f9fe38af0ca75bcbac86bc5e9fbab65" +checksum = "1908034a89d7b2630898e06863583ae4c00a0dd310c1589ca284195ee3f7f8a6" dependencies = [ "ahash", "arrow", @@ -1262,9 +1280,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8a92b53b3193fac1916a1c5b8e3f4347c526f6822e56b71faa5fb372327a863" +checksum = "47b7a12dd59ea07614b67dbb01d85254fbd93df45bcffa63495e11d3bdf847df" dependencies = [ "ahash", "arrow", @@ -1276,9 +1294,9 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fa0a5ac94c7cf3da97bedabd69d6bbca12aef84b9b37e6e9e8c25286511b5e2" +checksum = "4371cc4ad33978cc2a8be93bd54a232d3f2857b50401a14631c0705f3f910aae" dependencies = [ "arrow", "datafusion-common", @@ -1295,9 +1313,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "47.0.0" +version = "48.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "690c615db468c2e5fe5085b232d8b1c088299a6c63d87fd960a354a71f7acb55" +checksum = "dc47bc33025757a5c11f2cd094c5b6b5ed87f46fa33c023e6fdfa25fcbfade23" dependencies = [ "ahash", "arrow", @@ -1325,9 +1343,9 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a1afb2bdb05de7ff65be6883ebfd4ec027bd9f1f21c46aa3afd01927160a83" +checksum = "d8f5d9acd7d96e3bf2a7bb04818373cab6e51de0356e3694b94905fee7b4e8b6" dependencies = [ "arrow", "chrono", @@ -1341,9 +1359,9 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35b7a5876ebd6b564fb9a1fd2c3a2a9686b787071a256b47e4708f0916f9e46f" +checksum = "09ecb5ec152c4353b60f7a5635489834391f7a291d2b39a4820cd469e318b78e" dependencies = [ "arrow", "datafusion-common", @@ -1352,9 +1370,9 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad229a134c7406c057ece00c8743c0c34b97f4e72f78b475fe17b66c5e14fa4f" +checksum = "d7485da32283985d6b45bd7d13a65169dcbe8c869e25d01b2cfbc425254b4b49" dependencies = [ "arrow", "async-trait", @@ -1376,9 +1394,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64f6ab28b72b664c21a27b22a2ff815fd390ed224c26e89a93b5a8154a4e8607" +checksum = "a466b15632befddfeac68c125f0260f569ff315c6831538cbb40db754134e0df" dependencies = [ "arrow", "bigdecimal", @@ -1441,20 +1459,6 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" -[[package]] -name = 
"ffi-table-provider" -version = "0.1.0" -dependencies = [ - "arrow", - "arrow-array", - "arrow-schema", - "async-trait", - "datafusion", - "datafusion-ffi", - "pyo3", - "pyo3-build-config", -] - [[package]] name = "fixedbitset" version = "0.5.7" @@ -1488,6 +1492,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1666,6 +1676,11 @@ name = "hashbrown" version = "0.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] [[package]] name = "heck" @@ -2271,12 +2286,14 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "petgraph" -version = "0.7.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +checksum = "54acf3a685220b533e437e264e4d932cfbdc4cc7ec0cd232ed73c08d03b8a7ca" dependencies = [ "fixedbitset", + "hashbrown 0.15.3", "indexmap", + "serde", ] [[package]] @@ -2305,7 +2322,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" dependencies = [ "phf_shared", - "rand", + "rand 0.8.5", ] [[package]] @@ -2484,19 +2501,27 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ - "libc", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" +dependencies = [ "rand_chacha", - "rand_core", + "rand_core 0.9.3", ] [[package]] name = "rand_chacha" -version = "0.3.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.9.3", ] [[package]] @@ -2504,8 +2529,14 @@ name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" + +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" dependencies = [ - "getrandom 0.2.16", + "getrandom 0.3.3", ] [[package]] @@ -3032,9 +3063,9 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.16.0" +version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9" +checksum = "3cf4199d1e5d15ddd86a694e4d0dffa9c323ce759fea589f00fef9d81cc1931d" dependencies = [ "getrandom 0.3.3", "js-sys", diff --git a/examples/datafusion-ffi-example/Cargo.toml b/examples/datafusion-ffi-example/Cargo.toml index 319163554..b26ab48e3 100644 --- a/examples/datafusion-ffi-example/Cargo.toml +++ b/examples/datafusion-ffi-example/Cargo.toml @@ -16,13 +16,13 @@ # under the License. 
[package] -name = "ffi-table-provider" -version = "0.1.0" +name = "datafusion-ffi-example" +version = "0.2.0" edition = "2021" [dependencies] -datafusion = { version = "47.0.0" } -datafusion-ffi = { version = "47.0.0" } +datafusion = { version = "48.0.0" } +datafusion-ffi = { version = "48.0.0" } pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] } arrow = { version = "55.0.0" } arrow-array = { version = "55.0.0" } diff --git a/examples/datafusion-ffi-example/python/tests/_test_aggregate_udf.py b/examples/datafusion-ffi-example/python/tests/_test_aggregate_udf.py new file mode 100644 index 000000000..7ea6b295c --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_aggregate_udf.py @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from __future__ import annotations + +import pyarrow as pa +from datafusion import SessionContext, col, udaf +from datafusion_ffi_example import MySumUDF + + +def setup_context_with_table(): + ctx = SessionContext() + + # Pick numbers here so we get the same value in both groups + # since we cannot be certain of the output order of batches + batch = pa.RecordBatch.from_arrays( + [ + pa.array([1, 2, 3, None], type=pa.int64()), + pa.array([1, 1, 2, 2], type=pa.int64()), + ], + names=["a", "b"], + ) + ctx.register_record_batches("test_table", [[batch]]) + return ctx + + +def test_ffi_aggregate_register(): + ctx = setup_context_with_table() + my_udaf = udaf(MySumUDF()) + ctx.register_udaf(my_udaf) + + result = ctx.sql("select my_custom_sum(a) from test_table group by b").collect() + + assert len(result) == 2 + assert result[0].num_columns == 1 + + result = [r.column(0) for r in result] + expected = [ + pa.array([3], type=pa.int64()), + pa.array([3], type=pa.int64()), + ] + + assert result == expected + + +def test_ffi_aggregate_call_directly(): + ctx = setup_context_with_table() + my_udaf = udaf(MySumUDF()) + + result = ( + ctx.table("test_table").aggregate([col("b")], [my_udaf(col("a"))]).collect() + ) + + assert len(result) == 2 + assert result[0].num_columns == 2 + + result = [r.column(1) for r in result] + expected = [ + pa.array([3], type=pa.int64()), + pa.array([3], type=pa.int64()), + ] + + assert result == expected diff --git a/examples/datafusion-ffi-example/python/tests/_test_scalar_udf.py b/examples/datafusion-ffi-example/python/tests/_test_scalar_udf.py new file mode 100644 index 000000000..0c949c34a --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_scalar_udf.py @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import pyarrow as pa +from datafusion import SessionContext, col, udf +from datafusion_ffi_example import IsNullUDF + + +def setup_context_with_table(): + ctx = SessionContext() + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3, None])], + names=["a"], + ) + ctx.register_record_batches("test_table", [[batch]]) + return ctx + + +def test_ffi_scalar_register(): + ctx = setup_context_with_table() + my_udf = udf(IsNullUDF()) + ctx.register_udf(my_udf) + + result = ctx.sql("select my_custom_is_null(a) from test_table").collect() + + assert len(result) == 1 + assert result[0].num_columns == 1 + print(result) + + result = [r.column(0) for r in result] + expected = [ + pa.array([False, False, False, True], type=pa.bool_()), + ] + + assert result == expected + + +def test_ffi_scalar_call_directly(): + ctx = setup_context_with_table() + my_udf = udf(IsNullUDF()) + + result = ctx.table("test_table").select(my_udf(col("a"))).collect() + + assert len(result) == 1 + assert result[0].num_columns == 1 + print(result) + + result = [r.column(0) for r in result] + expected = [ + pa.array([False, False, False, True], type=pa.bool_()), + ] + + assert result == expected diff --git a/examples/datafusion-ffi-example/python/tests/_test_window_udf.py b/examples/datafusion-ffi-example/python/tests/_test_window_udf.py new file mode 100644 index 000000000..7d96994b9 --- 
/dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_window_udf.py @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import pyarrow as pa +from datafusion import SessionContext, col, udwf +from datafusion_ffi_example import MyRankUDF + + +def setup_context_with_table(): + ctx = SessionContext() + + # Pick numbers here so we get the same value in both groups + # since we cannot be certain of the output order of batches + batch = pa.RecordBatch.from_arrays( + [ + pa.array([40, 10, 30, 20], type=pa.int64()), + ], + names=["a"], + ) + ctx.register_record_batches("test_table", [[batch]]) + return ctx + + +def test_ffi_window_register(): + ctx = setup_context_with_table() + my_udwf = udwf(MyRankUDF()) + ctx.register_udwf(my_udwf) + + result = ctx.sql( + "select a, my_custom_rank() over (order by a) from test_table" + ).collect() + assert len(result) == 1 + assert result[0].num_columns == 2 + + results = [ + (result[0][0][idx].as_py(), result[0][1][idx].as_py()) for idx in range(4) + ] + results.sort() + + expected = [ + (10, 1), + (20, 2), + (30, 3), + (40, 4), + ] + assert results == expected + + +def test_ffi_window_call_directly(): + ctx = setup_context_with_table() 
+ my_udwf = udwf(MyRankUDF()) + + result = ( + ctx.table("test_table") + .select(col("a"), my_udwf().order_by(col("a")).build()) + .collect() + ) + + assert len(result) == 1 + assert result[0].num_columns == 2 + + results = [ + (result[0][0][idx].as_py(), result[0][1][idx].as_py()) for idx in range(4) + ] + results.sort() + + expected = [ + (10, 1), + (20, 2), + (30, 3), + (40, 4), + ] + assert results == expected diff --git a/examples/datafusion-ffi-example/src/aggregate_udf.rs b/examples/datafusion-ffi-example/src/aggregate_udf.rs new file mode 100644 index 000000000..9481fe9c6 --- /dev/null +++ b/examples/datafusion-ffi-example/src/aggregate_udf.rs @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow_schema::DataType; +use datafusion::error::Result as DataFusionResult; +use datafusion::functions_aggregate::sum::Sum; +use datafusion::logical_expr::function::AccumulatorArgs; +use datafusion::logical_expr::{Accumulator, AggregateUDF, AggregateUDFImpl, Signature}; +use datafusion_ffi::udaf::FFI_AggregateUDF; +use pyo3::types::PyCapsule; +use pyo3::{pyclass, pymethods, Bound, PyResult, Python}; +use std::any::Any; +use std::sync::Arc; + +#[pyclass(name = "MySumUDF", module = "datafusion_ffi_example", subclass)] +#[derive(Debug, Clone)] +pub(crate) struct MySumUDF { + inner: Arc, +} + +#[pymethods] +impl MySumUDF { + #[new] + fn new() -> Self { + Self { + inner: Arc::new(Sum::new()), + } + } + + fn __datafusion_aggregate_udf__<'py>( + &self, + py: Python<'py>, + ) -> PyResult> { + let name = cr"datafusion_aggregate_udf".into(); + + let func = Arc::new(AggregateUDF::from(self.clone())); + let provider = FFI_AggregateUDF::from(func); + + PyCapsule::new(py, provider, Some(name)) + } +} + +impl AggregateUDFImpl for MySumUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "my_custom_sum" + } + + fn signature(&self) -> &Signature { + self.inner.signature() + } + + fn return_type(&self, arg_types: &[DataType]) -> DataFusionResult { + self.inner.return_type(arg_types) + } + + fn accumulator(&self, acc_args: AccumulatorArgs) -> DataFusionResult> { + self.inner.accumulator(acc_args) + } + + fn coerce_types(&self, arg_types: &[DataType]) -> DataFusionResult> { + self.inner.coerce_types(arg_types) + } +} diff --git a/examples/datafusion-ffi-example/src/catalog_provider.rs b/examples/datafusion-ffi-example/src/catalog_provider.rs index 54e61cf3e..cd2616916 100644 --- a/examples/datafusion-ffi-example/src/catalog_provider.rs +++ b/examples/datafusion-ffi-example/src/catalog_provider.rs @@ -24,7 +24,6 @@ use datafusion::{ catalog::{ CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, SchemaProvider, TableProvider, }, - 
common::exec_err, datasource::MemTable, error::{DataFusionError, Result}, }; diff --git a/examples/datafusion-ffi-example/src/lib.rs b/examples/datafusion-ffi-example/src/lib.rs index 3a4cf2247..79af276fd 100644 --- a/examples/datafusion-ffi-example/src/lib.rs +++ b/examples/datafusion-ffi-example/src/lib.rs @@ -16,18 +16,27 @@ // under the License. use crate::catalog_provider::MyCatalogProvider; +use crate::aggregate_udf::MySumUDF; +use crate::scalar_udf::IsNullUDF; use crate::table_function::MyTableFunction; use crate::table_provider::MyTableProvider; +use crate::window_udf::MyRankUDF; use pyo3::prelude::*; pub(crate) mod catalog_provider; +pub(crate) mod aggregate_udf; +pub(crate) mod scalar_udf; pub(crate) mod table_function; pub(crate) mod table_provider; +pub(crate) mod window_udf; #[pymodule] fn datafusion_ffi_example(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; Ok(()) } diff --git a/examples/datafusion-ffi-example/src/scalar_udf.rs b/examples/datafusion-ffi-example/src/scalar_udf.rs new file mode 100644 index 000000000..727666638 --- /dev/null +++ b/examples/datafusion-ffi-example/src/scalar_udf.rs @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{Array, BooleanArray}; +use arrow_schema::DataType; +use datafusion::common::ScalarValue; +use datafusion::error::Result as DataFusionResult; +use datafusion::logical_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, TypeSignature, + Volatility, +}; +use datafusion_ffi::udf::FFI_ScalarUDF; +use pyo3::types::PyCapsule; +use pyo3::{pyclass, pymethods, Bound, PyResult, Python}; +use std::any::Any; +use std::sync::Arc; + +#[pyclass(name = "IsNullUDF", module = "datafusion_ffi_example", subclass)] +#[derive(Debug, Clone)] +pub(crate) struct IsNullUDF { + signature: Signature, +} + +#[pymethods] +impl IsNullUDF { + #[new] + fn new() -> Self { + Self { + signature: Signature::new(TypeSignature::Any(1), Volatility::Immutable), + } + } + + fn __datafusion_scalar_udf__<'py>(&self, py: Python<'py>) -> PyResult> { + let name = cr"datafusion_scalar_udf".into(); + + let func = Arc::new(ScalarUDF::from(self.clone())); + let provider = FFI_ScalarUDF::from(func); + + PyCapsule::new(py, provider, Some(name)) + } +} + +impl ScalarUDFImpl for IsNullUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "my_custom_is_null" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> DataFusionResult { + Ok(DataType::Boolean) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> DataFusionResult { + let input = &args.args[0]; + + Ok(match input { + ColumnarValue::Array(arr) => match arr.is_nullable() { + true => { + let nulls = arr.nulls().unwrap(); + let nulls = BooleanArray::from_iter(nulls.iter().map(|x| Some(!x))); + ColumnarValue::Array(Arc::new(nulls)) + } + false => ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))), + }, + ColumnarValue::Scalar(sv) => { + ColumnarValue::Scalar(ScalarValue::Boolean(Some(sv == 
&ScalarValue::Null))) + } + }) + } +} diff --git a/examples/datafusion-ffi-example/src/window_udf.rs b/examples/datafusion-ffi-example/src/window_udf.rs new file mode 100644 index 000000000..e0d397956 --- /dev/null +++ b/examples/datafusion-ffi-example/src/window_udf.rs @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow_schema::{DataType, FieldRef}; +use datafusion::error::Result as DataFusionResult; +use datafusion::functions_window::rank::rank_udwf; +use datafusion::logical_expr::function::{PartitionEvaluatorArgs, WindowUDFFieldArgs}; +use datafusion::logical_expr::{PartitionEvaluator, Signature, WindowUDF, WindowUDFImpl}; +use datafusion_ffi::udwf::FFI_WindowUDF; +use pyo3::types::PyCapsule; +use pyo3::{pyclass, pymethods, Bound, PyResult, Python}; +use std::any::Any; +use std::sync::Arc; + +#[pyclass(name = "MyRankUDF", module = "datafusion_ffi_example", subclass)] +#[derive(Debug, Clone)] +pub(crate) struct MyRankUDF { + inner: Arc, +} + +#[pymethods] +impl MyRankUDF { + #[new] + fn new() -> Self { + Self { inner: rank_udwf() } + } + + fn __datafusion_window_udf__<'py>(&self, py: Python<'py>) -> PyResult> { + let name = cr"datafusion_window_udf".into(); + + let func = Arc::new(WindowUDF::from(self.clone())); + let provider = FFI_WindowUDF::from(func); + + PyCapsule::new(py, provider, Some(name)) + } +} + +impl WindowUDFImpl for MyRankUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "my_custom_rank" + } + + fn signature(&self) -> &Signature { + self.inner.signature() + } + + fn partition_evaluator( + &self, + partition_evaluator_args: PartitionEvaluatorArgs, + ) -> DataFusionResult> { + self.inner + .inner() + .partition_evaluator(partition_evaluator_args) + } + + fn field(&self, field_args: WindowUDFFieldArgs) -> DataFusionResult { + self.inner.inner().field(field_args) + } + + fn coerce_types(&self, arg_types: &[DataType]) -> DataFusionResult> { + self.inner.coerce_types(arg_types) + } +} diff --git a/python/datafusion/user_defined.py b/python/datafusion/user_defined.py index dd634c7fb..bd686acbb 100644 --- a/python/datafusion/user_defined.py +++ b/python/datafusion/user_defined.py @@ -22,7 +22,7 @@ import functools from abc import ABCMeta, abstractmethod from enum import Enum -from typing import TYPE_CHECKING, Any, Callable, 
Optional, TypeVar, overload +from typing import TYPE_CHECKING, Any, Callable, Optional, Protocol, TypeVar, overload import pyarrow as pa @@ -77,6 +77,12 @@ def __str__(self) -> str: return self.name.lower() +class ScalarUDFExportable(Protocol): + """Type hint for object that has __datafusion_scalar_udf__ PyCapsule.""" + + def __datafusion_scalar_udf__(self) -> object: ... # noqa: D105 + + class ScalarUDF: """Class for performing scalar user-defined functions (UDF). @@ -96,6 +102,9 @@ def __init__( See helper method :py:func:`udf` for argument details. """ + if hasattr(func, "__datafusion_scalar_udf__"): + self._udf = df_internal.ScalarUDF.from_pycapsule(func) + return if isinstance(input_types, pa.DataType): input_types = [input_types] self._udf = df_internal.ScalarUDF( @@ -134,6 +143,10 @@ def udf( name: Optional[str] = None, ) -> ScalarUDF: ... + @overload + @staticmethod + def udf(func: ScalarUDFExportable) -> ScalarUDF: ... + @staticmethod def udf(*args: Any, **kwargs: Any): # noqa: D417 """Create a new User-Defined Function (UDF). @@ -147,7 +160,10 @@ def udf(*args: Any, **kwargs: Any): # noqa: D417 Args: func (Callable, optional): Only needed when calling as a function. - Skip this argument when using ``udf`` as a decorator. + Skip this argument when using `udf` as a decorator. If you have a Rust + backed ScalarUDF within a PyCapsule, you can pass this parameter + and ignore the rest. They will be determined directly from the + underlying function. See the online documentation for more information. input_types (list[pa.DataType]): The data types of the arguments to ``func``. This list must be of the same length as the number of arguments. 
@@ -215,12 +231,31 @@ def wrapper(*args: Any, **kwargs: Any): return decorator + if hasattr(args[0], "__datafusion_scalar_udf__"): + return ScalarUDF.from_pycapsule(args[0]) + if args and callable(args[0]): # Case 1: Used as a function, require the first parameter to be callable return _function(*args, **kwargs) # Case 2: Used as a decorator with parameters return _decorator(*args, **kwargs) + @staticmethod + def from_pycapsule(func: ScalarUDFExportable) -> ScalarUDF: + """Create a Scalar UDF from ScalarUDF PyCapsule object. + + This function will instantiate a Scalar UDF that uses a DataFusion + ScalarUDF that is exported via the FFI bindings. + """ + name = str(func.__class__) + return ScalarUDF( + name=name, + func=func, + input_types=None, + return_type=None, + volatility=None, + ) + class Accumulator(metaclass=ABCMeta): """Defines how an :py:class:`AggregateUDF` accumulates values.""" @@ -242,6 +277,12 @@ def evaluate(self) -> pa.Scalar: """Return the resultant value.""" +class AggregateUDFExportable(Protocol): + """Type hint for object that has __datafusion_aggregate_udf__ PyCapsule.""" + + def __datafusion_aggregate_udf__(self) -> object: ... # noqa: D105 + + class AggregateUDF: """Class for performing scalar user-defined functions (UDF). @@ -263,6 +304,9 @@ def __init__( See :py:func:`udaf` for a convenience function and argument descriptions. """ + if hasattr(accumulator, "__datafusion_aggregate_udf__"): + self._udaf = df_internal.AggregateUDF.from_pycapsule(accumulator) + return self._udaf = df_internal.AggregateUDF( name, accumulator, @@ -307,7 +351,7 @@ def udaf( ) -> AggregateUDF: ... @staticmethod - def udaf(*args: Any, **kwargs: Any): # noqa: D417 + def udaf(*args: Any, **kwargs: Any): # noqa: D417, C901 """Create a new User-Defined Aggregate Function (UDAF). This class allows you to define an aggregate function that can be used in @@ -364,6 +408,10 @@ def udf4() -> Summarize: Args: accum: The accumulator python function. 
Only needed when calling as a function. Skip this argument when using ``udaf`` as a decorator. + If you have a Rust backed AggregateUDF within a PyCapsule, you can + pass this parameter and ignore the rest. They will be determined + directly from the underlying function. See the online documentation + for more information. input_types: The data types of the arguments to ``accum``. return_type: The data type of the return value. state_type: The data types of the intermediate accumulation. @@ -422,12 +470,32 @@ def wrapper(*args: Any, **kwargs: Any) -> Expr: return decorator + if hasattr(args[0], "__datafusion_aggregate_udf__"): + return AggregateUDF.from_pycapsule(args[0]) + if args and callable(args[0]): # Case 1: Used as a function, require the first parameter to be callable return _function(*args, **kwargs) # Case 2: Used as a decorator with parameters return _decorator(*args, **kwargs) + @staticmethod + def from_pycapsule(func: AggregateUDFExportable) -> AggregateUDF: + """Create an Aggregate UDF from AggregateUDF PyCapsule object. + + This function will instantiate a Aggregate UDF that uses a DataFusion + AggregateUDF that is exported via the FFI bindings. + """ + name = str(func.__class__) + return AggregateUDF( + name=name, + accumulator=func, + input_types=None, + return_type=None, + state_type=None, + volatility=None, + ) + class WindowEvaluator: """Evaluator class for user-defined window functions (UDWF). @@ -588,6 +656,12 @@ def include_rank(self) -> bool: return False +class WindowUDFExportable(Protocol): + """Type hint for object that has __datafusion_window_udf__ PyCapsule.""" + + def __datafusion_window_udf__(self) -> object: ... # noqa: D105 + + class WindowUDF: """Class for performing window user-defined functions (UDF). @@ -608,6 +682,9 @@ def __init__( See :py:func:`udwf` for a convenience function and argument descriptions. 
""" + if hasattr(func, "__datafusion_window_udf__"): + self._udwf = df_internal.WindowUDF.from_pycapsule(func) + return self._udwf = df_internal.WindowUDF( name, func, input_types, return_type, str(volatility) ) @@ -683,7 +760,10 @@ def biased_numbers() -> BiasedNumbers: Args: func: Only needed when calling as a function. Skip this argument when - using ``udwf`` as a decorator. + using ``udwf`` as a decorator. If you have a Rust backed WindowUDF + within a PyCapsule, you can pass this parameter and ignore the rest. + They will be determined directly from the underlying function. See + the online documentation for more information. input_types: The data types of the arguments. return_type: The data type of the return value. volatility: See :py:class:`Volatility` for allowed values. @@ -692,6 +772,9 @@ def biased_numbers() -> BiasedNumbers: Returns: A user-defined window function that can be used in window function calls. """ + if hasattr(args[0], "__datafusion_window_udf__"): + return WindowUDF.from_pycapsule(args[0]) + if args and callable(args[0]): # Case 1: Used as a function, require the first parameter to be callable return WindowUDF._create_window_udf(*args, **kwargs) @@ -759,6 +842,22 @@ def wrapper(*args: Any, **kwargs: Any) -> Expr: return decorator + @staticmethod + def from_pycapsule(func: WindowUDFExportable) -> WindowUDF: + """Create a Window UDF from WindowUDF PyCapsule object. + + This function will instantiate a Window UDF that uses a DataFusion + WindowUDF that is exported via the FFI bindings. + """ + name = str(func.__class__) + return WindowUDF( + name=name, + func=func, + input_types=None, + return_type=None, + volatility=None, + ) + class TableFunction: """Class for performing user-defined table functions (UDTF). 
diff --git a/src/functions.rs b/src/functions.rs index b40500b8b..eeef48385 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -682,7 +682,7 @@ pub fn approx_percentile_cont_with_weight( add_builder_fns_to_aggregate(agg_fn, None, filter, None, None) } -// We handle first_value explicitly because the signature expects an order_by +// We handle last_value explicitly because the signature expects an order_by // https://github.com/apache/datafusion/issues/12376 #[pyfunction] #[pyo3(signature = (expr, distinct=None, filter=None, order_by=None, null_treatment=None))] diff --git a/src/udaf.rs b/src/udaf.rs index 34a9cd51d..78f4e2b0c 100644 --- a/src/udaf.rs +++ b/src/udaf.rs @@ -19,6 +19,10 @@ use std::sync::Arc; use pyo3::{prelude::*, types::PyTuple}; +use crate::common::data_type::PyScalarValue; +use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionResult}; +use crate::expr::PyExpr; +use crate::utils::{parse_volatility, validate_pycapsule}; use datafusion::arrow::array::{Array, ArrayRef}; use datafusion::arrow::datatypes::DataType; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; @@ -27,11 +31,8 @@ use datafusion::error::{DataFusionError, Result}; use datafusion::logical_expr::{ create_udaf, Accumulator, AccumulatorFactoryFunction, AggregateUDF, }; - -use crate::common::data_type::PyScalarValue; -use crate::errors::to_datafusion_err; -use crate::expr::PyExpr; -use crate::utils::parse_volatility; +use datafusion_ffi::udaf::{FFI_AggregateUDF, ForeignAggregateUDF}; +use pyo3::types::PyCapsule; #[derive(Debug)] struct RustAccumulator { @@ -183,6 +184,26 @@ impl PyAggregateUDF { Ok(Self { function }) } + #[staticmethod] + pub fn from_pycapsule(func: Bound<'_, PyAny>) -> PyDataFusionResult { + if func.hasattr("__datafusion_aggregate_udf__")? 
{ + let capsule = func.getattr("__datafusion_aggregate_udf__")?.call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_aggregate_udf")?; + + let udaf = unsafe { capsule.reference::() }; + let udaf: ForeignAggregateUDF = udaf.try_into()?; + + Ok(Self { + function: udaf.into(), + }) + } else { + Err(crate::errors::PyDataFusionError::Common( + "__datafusion_aggregate_udf__ does not exist on AggregateUDF object.".to_string(), + )) + } + } + /// creates a new PyExpr with the call of the udf #[pyo3(signature = (*args))] fn __call__(&self, args: Vec) -> PyResult { diff --git a/src/udf.rs b/src/udf.rs index 574c9d7b5..de1e3f18c 100644 --- a/src/udf.rs +++ b/src/udf.rs @@ -17,6 +17,8 @@ use std::sync::Arc; +use datafusion_ffi::udf::{FFI_ScalarUDF, ForeignScalarUDF}; +use pyo3::types::PyCapsule; use pyo3::{prelude::*, types::PyTuple}; use datafusion::arrow::array::{make_array, Array, ArrayData, ArrayRef}; @@ -29,8 +31,9 @@ use datafusion::logical_expr::ScalarUDF; use datafusion::logical_expr::{create_udf, ColumnarValue}; use crate::errors::to_datafusion_err; +use crate::errors::{py_datafusion_err, PyDataFusionResult}; use crate::expr::PyExpr; -use crate::utils::parse_volatility; +use crate::utils::{parse_volatility, validate_pycapsule}; /// Create a Rust callable function from a python function that expects pyarrow arrays fn pyarrow_function_to_rust( @@ -105,6 +108,26 @@ impl PyScalarUDF { Ok(Self { function }) } + #[staticmethod] + pub fn from_pycapsule(func: Bound<'_, PyAny>) -> PyDataFusionResult { + if func.hasattr("__datafusion_scalar_udf__")? 
{ + let capsule = func.getattr("__datafusion_scalar_udf__")?.call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_scalar_udf")?; + + let udf = unsafe { capsule.reference::() }; + let udf: ForeignScalarUDF = udf.try_into()?; + + Ok(Self { + function: udf.into(), + }) + } else { + Err(crate::errors::PyDataFusionError::Common( + "__datafusion_scalar_udf__ does not exist on ScalarUDF object.".to_string(), + )) + } + } + /// creates a new PyExpr with the call of the udf #[pyo3(signature = (*args))] fn __call__(&self, args: Vec) -> PyResult { diff --git a/src/udwf.rs b/src/udwf.rs index a0c8cc59a..4fb98916b 100644 --- a/src/udwf.rs +++ b/src/udwf.rs @@ -27,16 +27,17 @@ use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use crate::common::data_type::PyScalarValue; -use crate::errors::to_datafusion_err; +use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionResult}; use crate::expr::PyExpr; -use crate::utils::parse_volatility; +use crate::utils::{parse_volatility, validate_pycapsule}; use datafusion::arrow::datatypes::DataType; use datafusion::arrow::pyarrow::{FromPyArrow, PyArrowType, ToPyArrow}; use datafusion::error::{DataFusionError, Result}; use datafusion::logical_expr::{ PartitionEvaluator, PartitionEvaluatorFactory, Signature, Volatility, WindowUDF, WindowUDFImpl, }; -use pyo3::types::{PyList, PyTuple}; +use datafusion_ffi::udwf::{FFI_WindowUDF, ForeignWindowUDF}; +use pyo3::types::{PyCapsule, PyList, PyTuple}; #[derive(Debug)] struct RustPartitionEvaluator { @@ -245,6 +246,26 @@ impl PyWindowUDF { Ok(self.function.call(args).into()) } + #[staticmethod] + pub fn from_pycapsule(func: Bound<'_, PyAny>) -> PyDataFusionResult { + if func.hasattr("__datafusion_window_udf__")? 
{ + let capsule = func.getattr("__datafusion_window_udf__")?.call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_window_udf")?; + + let udwf = unsafe { capsule.reference::() }; + let udwf: ForeignWindowUDF = udwf.try_into()?; + + Ok(Self { + function: udwf.into(), + }) + } else { + Err(crate::errors::PyDataFusionError::Common( + "__datafusion_window_udf__ does not exist on WindowUDF object.".to_string(), + )) + } + } + fn __repr__(&self) -> PyResult { Ok(format!("WindowUDF({})", self.function.name())) } From 2e1b71369eefc97c22b82be84bbabb414f748fb9 Mon Sep 17 00:00:00 2001 From: kosiew Date: Fri, 4 Jul 2025 20:36:05 +0800 Subject: [PATCH 150/248] refactor: style loading logic in DataFrameHtmlFormatter (#1177) --- python/datafusion/dataframe_formatter.py | 117 ++++++++--------------- python/tests/test_dataframe.py | 60 +++++------- 2 files changed, 63 insertions(+), 114 deletions(-) diff --git a/python/datafusion/dataframe_formatter.py b/python/datafusion/dataframe_formatter.py index 27f00f9c3..2323224b8 100644 --- a/python/datafusion/dataframe_formatter.py +++ b/python/datafusion/dataframe_formatter.py @@ -135,9 +135,6 @@ class DataFrameHtmlFormatter: session """ - # Class variable to track if styles have been loaded in the notebook - _styles_loaded = False - def __init__( self, max_cell_length: int = 25, @@ -260,23 +257,6 @@ def set_custom_header_builder(self, builder: Callable[[Any], str]) -> None: """ self._custom_header_builder = builder - @classmethod - def is_styles_loaded(cls) -> bool: - """Check if HTML styles have been loaded in the current session. - - This method is primarily intended for debugging UI rendering issues - related to style loading. 
- - Returns: - True if styles have been loaded, False otherwise - - Example: - >>> from datafusion.dataframe_formatter import DataFrameHtmlFormatter - >>> DataFrameHtmlFormatter.is_styles_loaded() - False - """ - return cls._styles_loaded - def format_html( self, batches: list, @@ -315,18 +295,7 @@ def format_html( # Build HTML components html = [] - # Only include styles and scripts if: - # 1. Not using shared styles, OR - # 2. Using shared styles but they haven't been loaded yet - include_styles = ( - not self.use_shared_styles or not DataFrameHtmlFormatter._styles_loaded - ) - - if include_styles: - html.extend(self._build_html_header()) - # If we're using shared styles, mark them as loaded - if self.use_shared_styles: - DataFrameHtmlFormatter._styles_loaded = True + html.extend(self._build_html_header()) html.extend(self._build_table_container_start()) @@ -338,7 +307,7 @@ def format_html( html.append("") # Add footer (JavaScript and messages) - if include_styles and self.enable_cell_expansion: + if self.enable_cell_expansion: html.append(self._get_javascript()) # Always add truncation message if needed (independent of styles) @@ -375,14 +344,20 @@ def format_str( def _build_html_header(self) -> list[str]: """Build the HTML header with CSS styles.""" - html = [] - html.append("") + html.append(f"") return html def _build_table_container_start(self) -> list[str]: @@ -570,28 +545,31 @@ def _get_default_css(self) -> str: def _get_javascript(self) -> str: """Get JavaScript code for interactive elements.""" return """ - - """ + +""" class FormatterManager: @@ -712,24 +690,9 @@ def reset_formatter() -> None: >>> reset_formatter() # Reset formatter to default settings """ formatter = DataFrameHtmlFormatter() - # Reset the styles_loaded flag to ensure styles will be reloaded - DataFrameHtmlFormatter._styles_loaded = False set_formatter(formatter) -def reset_styles_loaded_state() -> None: - """Reset the styles loaded state to force reloading of styles. 
- - This can be useful when switching between notebook sessions or - when styles need to be refreshed. - - Example: - >>> from datafusion.html_formatter import reset_styles_loaded_state - >>> reset_styles_loaded_state() # Force styles to reload in next render - """ - DataFrameHtmlFormatter._styles_loaded = False - - def _refresh_formatter_reference() -> None: """Refresh formatter reference in any modules using it. diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index c9ae38d8e..a3870ead8 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -42,7 +42,6 @@ configure_formatter, get_formatter, reset_formatter, - reset_styles_loaded_state, ) from datafusion.expr import Window from pyarrow.csv import write_csv @@ -2177,27 +2176,15 @@ def test_html_formatter_shared_styles(df, clean_formatter_state): # First, ensure we're using shared styles configure_formatter(use_shared_styles=True) - # Get HTML output for first table - should include styles html_first = df._repr_html_() - - # Verify styles are included in first render - assert "
" - f"{field.name}
" - f"
" - "" - "" - f"{formatted_value}" - f"" - f"
" - f"
{formatted_value}