From ea90ad750e8dfbf3171b641aa494fca16661e751 Mon Sep 17 00:00:00 2001 From: Chitral Verma Date: Mon, 4 Sep 2023 13:45:26 +0530 Subject: [PATCH 1/7] Allow async schema inference for file formats Signed-off-by: Chitral Verma --- crates/polars-io/Cargo.toml | 28 +- crates/polars-io/src/csv/utils.rs | 2 +- .../polars-io/src/input/files_async/avro.rs | 39 + crates/polars-io/src/input/files_async/csv.rs | 110 ++ crates/polars-io/src/input/files_async/ipc.rs | 37 + crates/polars-io/src/input/files_async/mod.rs | 94 ++ .../polars-io/src/input/files_async/ndjson.rs | 79 ++ .../src/input/files_async/parquet.rs | 36 + crates/polars-io/src/input/mod.rs | 1 + crates/polars-io/src/lib.rs | 2 + py-polars/Cargo.lock | 1037 ++++++++++++++++- 11 files changed, 1445 insertions(+), 20 deletions(-) create mode 100644 crates/polars-io/src/input/files_async/avro.rs create mode 100644 crates/polars-io/src/input/files_async/csv.rs create mode 100644 crates/polars-io/src/input/files_async/ipc.rs create mode 100644 crates/polars-io/src/input/files_async/mod.rs create mode 100644 crates/polars-io/src/input/files_async/ndjson.rs create mode 100644 crates/polars-io/src/input/files_async/parquet.rs create mode 100644 crates/polars-io/src/input/mod.rs diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index 003b36abef42..a4bd61a6757d 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -18,13 +18,13 @@ polars-utils = { version = "0.32.0", path = "../polars-utils" } ahash = { workspace = true } arrow = { workspace = true } -async-trait = { version = "0.1.59", optional = true } +async-trait = { version = "0.1.59" } bytes = { version = "1.3" } chrono = { workspace = true, optional = true } chrono-tz = { workspace = true, optional = true } fast-float = { version = "0.2", optional = true } flate2 = { version = "1", optional = true, default-features = false } -futures = { workspace = true, optional = true } +futures = { workspace = true } lexical = { version = "6", optional = true, default-features = false, features = ["std", "parse-integers"] } lexical-core = { version = "0.8", optional = true } memchr = { workspace = true } @@ -38,9 +38,22 @@ serde = { workspace = true, features = ["derive"], optional = true } serde_json = { version = "1", default-features = false, features = ["alloc", "raw_value"], optional = true } simd-json = { workspace = true, optional = true } simdutf8 = { version = "0.1", optional = true } -tokio = { version = "1.26", features = ["net"], optional = true } +tokio = { version = "1.26", features = ["net", "io-util"] } +tokio-util = { version = "0.7.8", features = ["io-util"] } url = { workspace = true, optional = true } +[dependencies.opendal] +version = "0" +default-features = false +features = [ + "services-azblob", + "services-azdfs", + "services-gcs", + "services-s3", + "services-http", + "services-webhdfs", +] + [target.'cfg(not(target_family = "wasm"))'.dependencies] home = "0.5.4" @@ -59,14 +72,15 @@ json = [ "lexical-core", "serde_json", "dtype-struct", + "arrow/io_json_read", ] # support for arrows ipc file parsing -ipc = ["arrow/io_ipc", "arrow/io_ipc_compression", "memmap"] +ipc = ["arrow/io_ipc", "arrow/io_ipc_compression", "memmap", "arrow/io_ipc_read_async"] # support for arrows streaming ipc file parsing ipc_streaming = ["arrow/io_ipc", "arrow/io_ipc_compression"] # support for arrow avro parsing -avro = ["arrow/io_avro", "arrow/io_avro_compression"] -csv = ["memmap", "lexical", "polars-core/rows", "lexical-core", "fast-float", "simdutf8"] +avro = ["arrow/io_avro", "arrow/io_avro_compression", "arrow/io_avro_async"] +csv = ["memmap", "lexical", "polars-core/rows", "lexical-core", "fast-float", "simdutf8", "arrow/io_csv_read_async"] decompress = ["flate2/rust_backend"] decompress-fast = ["flate2/zlib-ng"] dtype-categorical = ["polars-core/dtype-categorical"] @@ -88,7 +102,7 @@ dtype-decimal = ["polars-core/dtype-decimal"] fmt = ["polars-core/fmt"] lazy = [] parquet = ["polars-core/parquet", "arrow/io_parquet", "arrow/io_parquet_compression", "memmap"] -async = ["async-trait", "futures", "tokio", "arrow/io_ipc_write_async", "polars-error/regex"] +async = ["arrow/io_ipc_write_async", "polars-error/regex"] cloud = ["object_store", "async", "url"] aws = ["object_store/aws", "cloud", "polars-core/aws"] azure = ["object_store/azure", "cloud", "polars-core/azure"] diff --git a/crates/polars-io/src/csv/utils.rs b/crates/polars-io/src/csv/utils.rs index e9c89d1a2ab7..c9b322cc572b 100644 --- a/crates/polars-io/src/csv/utils.rs +++ b/crates/polars-io/src/csv/utils.rs @@ -102,7 +102,7 @@ static BOOLEAN_RE: Lazy = Lazy::new(|| { }); /// Infer the data type of a record -fn infer_field_schema(string: &str, try_parse_dates: bool) -> DataType { +pub(crate) fn infer_field_schema(string: &str, try_parse_dates: bool) -> DataType { // when quoting is enabled in the reader, these quotes aren't escaped, we default to // Utf8 for them if string.starts_with('"') { diff --git a/crates/polars-io/src/input/files_async/avro.rs b/crates/polars-io/src/input/files_async/avro.rs new file mode 100644 index 000000000000..1e3d24536367 --- /dev/null +++ b/crates/polars-io/src/input/files_async/avro.rs @@ -0,0 +1,39 @@ +use async_trait::async_trait; +use opendal::Operator; +use polars_core::prelude::ArrowSchema; +use polars_error::{to_compute_err, PolarsResult}; + +use crate::input::files_async::FileFormat; + +#[derive(Debug, Default)] +pub struct AvroFormat {} + +impl AvroFormat { + /// Construct a new Format with no local overrides + pub fn new() -> Self { + Self::default() + } +} + +#[async_trait] +impl FileFormat for AvroFormat { + /// Read and parse the schema of the Avro file at location `path` + async fn fetch_schema_async( + &self, + store_op: &Operator, + path: String, + ) -> PolarsResult { + let mut reader = store_op + .reader(path.as_str()) + .await + .map_err(to_compute_err)?; + + let metadata = arrow::io::avro::avro_schema::read_async::read_metadata(&mut reader) + .await + .map_err(to_compute_err)?; + let schema = + arrow::io::avro::read::infer_schema(&metadata.record).map_err(to_compute_err)?; + + Ok(schema) + } +} diff --git a/crates/polars-io/src/input/files_async/csv.rs b/crates/polars-io/src/input/files_async/csv.rs new file mode 100644 index 000000000000..3839d512003a --- /dev/null +++ b/crates/polars-io/src/input/files_async/csv.rs @@ -0,0 +1,110 @@ +use async_trait::async_trait; +use opendal::Operator; +use polars_core::prelude::ArrowSchema; +use polars_error::{to_compute_err, PolarsResult}; + +use crate::input::files_async::{FileFormat, DEFAULT_SCHEMA_INFER_MAX_RECORD}; + +#[derive(Debug)] +pub struct CSVFormat { + schema_infer_max_records: Option, + has_header: bool, + delimiter: u8, + comment_char: Option, + try_parse_dates: bool, +} + +impl Default for CSVFormat { + fn default() -> Self { + Self { + schema_infer_max_records: Some(DEFAULT_SCHEMA_INFER_MAX_RECORD), + has_header: true, + delimiter: b',', + comment_char: None, + try_parse_dates: true, + } + } +} + +impl CSVFormat { + /// Construct a new Format with no local overrides + pub fn new() -> Self { + Self::default() + } + + /// Sets a limit in terms of records to scan to infer the schema + /// The default is `DEFAULT_SCHEMA_INFER_MAX_RECORD` + pub fn set_schema_infer_max_records(mut self, schema_infer_max_records: Option) -> Self { + self.schema_infer_max_records = schema_infer_max_records; + self + } + + /// Whether to treat the first row as a special header row. + /// The default is `true`. + pub fn set_has_header(mut self, has_header: bool) -> Self { + self.has_header = has_header; + self + } + + /// Sets the field delimiter to use when parsing CSV. + /// + /// The default is `b','`. + pub fn set_delimiter(mut self, delimiter: u8) -> Self { + self.delimiter = delimiter; + self + } + + /// The comment character to use when parsing CSV. + /// If the start of a record begins with the byte given here, then that line is ignored by the CSV parser. + /// This is disabled by default. + pub fn set_comment_char(mut self, comment_char: Option) -> Self { + self.comment_char = comment_char; + self + } + + /// Automatically try to parse dates/ datetimes and time. + /// If parsing fails, columns remain of dtype `[DataType::Utf8]`. + /// The default is `true`. + pub fn set_try_parse_dates(mut self, try_parse_dates: bool) -> Self { + self.try_parse_dates = try_parse_dates; + self + } +} + +#[async_trait] +impl FileFormat for CSVFormat { + /// Read and parse the schema of the CSV file at location `path` + async fn fetch_schema_async( + &self, + store_op: &Operator, + path: String, + ) -> PolarsResult { + let reader = store_op + .reader(path.as_str()) + .await + .map_err(to_compute_err)?; + + let mut async_reader = arrow::io::csv::read_async::AsyncReaderBuilder::new() + .delimiter(self.delimiter) + .comment(self.comment_char) + .has_headers(self.has_header) + .create_reader(reader); + + let (fields, _) = arrow::io::csv::read_async::infer_schema( + &mut async_reader, + self.schema_infer_max_records, + self.has_header, + &|input: &[u8]| { + crate::csv::utils::infer_field_schema( + std::str::from_utf8(input).unwrap(), + self.try_parse_dates, + ) + .to_arrow() + }, + ) + .await + .map_err(to_compute_err)?; + + Ok(ArrowSchema::from(fields)) + } +} diff --git a/crates/polars-io/src/input/files_async/ipc.rs b/crates/polars-io/src/input/files_async/ipc.rs new file mode 100644 index 000000000000..547d64f71269 --- /dev/null +++ b/crates/polars-io/src/input/files_async/ipc.rs @@ -0,0 +1,37 @@ +use async_trait::async_trait; +use opendal::Operator; +use polars_core::prelude::ArrowSchema; +use polars_error::{to_compute_err, PolarsResult}; + +use crate::input::files_async::FileFormat; + +#[derive(Debug, Default)] +pub struct IPCFormat {} + +impl IPCFormat { + /// Construct a new Format with no local overrides + pub fn new() -> Self { + Self::default() + } +} + +#[async_trait] +impl FileFormat for IPCFormat { + /// Read and parse the schema of the IPC file at location `path` + async fn fetch_schema_async( + &self, + store_op: &Operator, + path: String, + ) -> PolarsResult { + let mut reader = store_op + .reader(path.as_str()) + .await + .map_err(to_compute_err)?; + + let metadata = arrow::io::ipc::read::file_async::read_file_metadata_async(&mut reader) + .await + .map_err(to_compute_err)?; + + Ok(metadata.schema) + } +} diff --git a/crates/polars-io/src/input/files_async/mod.rs b/crates/polars-io/src/input/files_async/mod.rs new file mode 100644 index 000000000000..2e929d152643 --- /dev/null +++ b/crates/polars-io/src/input/files_async/mod.rs @@ -0,0 +1,94 @@ +use async_trait::async_trait; +use futures::{StreamExt, TryStreamExt}; +use opendal::Operator; +use polars_core::prelude::{ArrowSchema, Schema}; +use polars_error::{polars_bail, polars_ensure, to_compute_err, PolarsResult}; + +#[cfg(feature = "avro")] +pub mod avro; +#[cfg(feature = "csv")] +pub mod csv; +#[cfg(any(feature = "ipc", feature = "ipc_streaming"))] +pub mod ipc; +#[cfg(feature = "json")] +pub mod ndjson; +#[cfg(feature = "parquet")] +pub mod parquet; + +pub trait FileFormatOptions {} + +/// Default max records to scan to infer the schema +const DEFAULT_SCHEMA_INFER_MAX_RECORD: usize = 1000; + +/// The number of files to read in parallel when inferring schema +const SCHEMA_INFERENCE_CONCURRENCY: usize = 32; + +#[async_trait] +pub trait FileFormat: Send + Sync + std::fmt::Debug { + /// Infer the common schema of the provided objects. + /// For more than one file, the schema of all the files must be merge-able if `strict_schema == true` + /// or else this might fail. + /// The implementations handle whether the schema inference from either file metadata or from its content. + async fn infer_schema_async( + &self, + store_op: &Operator, + objects: Vec, + strict_schema: bool, + ) -> PolarsResult { + polars_ensure!(!objects.is_empty(), NoData: "at least one path must be provided to infer schema"); + + let schemas: Vec<_> = futures::stream::iter(objects) + .map(|object| self.fetch_schema_async(store_op, object)) + .boxed() // Workaround https://github.com/rust-lang/rust/issues/64552 + .buffered(SCHEMA_INFERENCE_CONCURRENCY) + .try_collect() + .await?; + + self.handle_schema(schemas, strict_schema) + } + + /// Read and parse the schema of the Avro file at location `path` + async fn fetch_schema_async( + &self, + store_op: &Operator, + path: String, + ) -> PolarsResult; + + // fn read( + // &self, + // n_rows: Option, + // columns: Option>, + // predicate: Option>, + // projection: Option>, + // options: O, + // ) -> PolarsResult; + // + // fn get_batches(&self) -> PolarsResult> { + // polars_bail!(ComputeError: "Functionality `get_batches` is currently not supported.") + // } + + fn handle_schema( + &self, + schemas: Vec, + strict_schema: bool, + ) -> PolarsResult { + let schema = if strict_schema { + let s = schemas + .windows(2) + .all(|a| a[0] == a[1]) + .then(|| &schemas[0]) + .ok_or(to_compute_err("Schemas of all files must match."))?; + + Schema::from_iter(s.clone().fields.iter()) + } else { + let mut default_schema = Schema::default(); + for s in schemas { + default_schema.merge(Schema::from_iter(s.fields.iter())) + } + + default_schema + }; + + Ok(schema) + } +} diff --git a/crates/polars-io/src/input/files_async/ndjson.rs b/crates/polars-io/src/input/files_async/ndjson.rs new file mode 100644 index 000000000000..aa012f16f80f --- /dev/null +++ b/crates/polars-io/src/input/files_async/ndjson.rs @@ -0,0 +1,79 @@ +use arrow::array::StructArray; +use async_trait::async_trait; +use opendal::Operator; +use polars_core::prelude::ArrowSchema; +use polars_error::{to_compute_err, PolarsResult}; +use tokio::io::AsyncBufReadExt; +use tokio_util::io::StreamReader; + +use crate::input::files_async::{FileFormat, DEFAULT_SCHEMA_INFER_MAX_RECORD}; + +#[derive(Debug)] +pub struct NdJSONFormat { + schema_infer_max_records: Option, +} + +impl Default for NdJSONFormat { + fn default() -> Self { + Self { + schema_infer_max_records: Some(DEFAULT_SCHEMA_INFER_MAX_RECORD), + } + } +} + +impl NdJSONFormat { + /// Construct a new Format with no local overrides + pub fn new() -> Self { + Self::default() + } + + /// Sets a limit in terms of records to scan to infer the schema + /// The default is `DEFAULT_SCHEMA_INFER_MAX_RECORD` + pub fn set_schema_infer_max_records(mut self, schema_infer_max_records: Option) -> Self { + self.schema_infer_max_records = schema_infer_max_records; + self + } +} + +#[async_trait] +impl FileFormat for NdJSONFormat { + /// Read and parse the schema of the ndjson/ jsonl file at location `path` + async fn fetch_schema_async( + &self, + store_op: &Operator, + path: String, + ) -> PolarsResult { + let reader = store_op + .reader(path.as_str()) + .await + .map_err(to_compute_err)?; + + let mut stream_reader = StreamReader::new(reader); + + let mut line = String::new(); + let mut lines = Vec::new(); + + loop { + line.clear(); + let len = stream_reader + .read_line(&mut line) + .await + .map_err(to_compute_err)?; + if len == 0 + || lines.len() + >= self + .schema_infer_max_records + .unwrap_or(DEFAULT_SCHEMA_INFER_MAX_RECORD) + { + break; + } + + lines.push(line.clone()); + } + + let dt = arrow::io::ndjson::read::infer_iter(lines.iter()).map_err(to_compute_err)?; + let schema = ArrowSchema::from(StructArray::get_fields(&dt).to_vec()); + + Ok(schema) + } +} diff --git a/crates/polars-io/src/input/files_async/parquet.rs b/crates/polars-io/src/input/files_async/parquet.rs new file mode 100644 index 000000000000..0aaade150587 --- /dev/null +++ b/crates/polars-io/src/input/files_async/parquet.rs @@ -0,0 +1,36 @@ +use async_trait::async_trait; +use opendal::Operator; +use polars_core::prelude::ArrowSchema; +use polars_error::{to_compute_err, PolarsResult}; + +use crate::input::files_async::FileFormat; + +#[derive(Debug, Default)] +pub struct ParquetFormat {} + +impl ParquetFormat { + /// Construct a new Format with no local overrides + pub fn new() -> Self { + Self::default() + } +} + +#[async_trait] +impl FileFormat for ParquetFormat { + /// Read and parse the schema of the Parquet file at location `path` + async fn fetch_schema_async( + &self, + store_op: &Operator, + path: String, + ) -> PolarsResult { + let mut reader = store_op + .reader(path.as_str()) + .await + .map_err(to_compute_err)?; + + let metadata = arrow::io::parquet::read::read_metadata_async(&mut reader).await?; + let schema = arrow::io::parquet::read::infer_schema(&metadata).map_err(to_compute_err)?; + + Ok(schema) + } +} diff --git a/crates/polars-io/src/input/mod.rs b/crates/polars-io/src/input/mod.rs new file mode 100644 index 000000000000..9bb646c2405b --- /dev/null +++ b/crates/polars-io/src/input/mod.rs @@ -0,0 +1 @@ +pub mod files_async; diff --git a/crates/polars-io/src/lib.rs b/crates/polars-io/src/lib.rs index e3f5f25b737e..3f4b7e72eb81 100644 --- a/crates/polars-io/src/lib.rs +++ b/crates/polars-io/src/lib.rs @@ -35,6 +35,7 @@ pub mod prelude; mod tests; pub(crate) mod utils; +pub mod input; #[cfg(feature = "partition")] pub mod partition; @@ -44,6 +45,7 @@ use std::path::{Path, PathBuf}; #[allow(unused)] // remove when updating to rust nightly >= 1.61 use arrow::array::new_empty_array; use arrow::error::Result as ArrowResult; +pub use input::files_async; pub use options::*; use polars_core::frame::ArrowChunk; use polars_core::prelude::*; diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index ebac468c404a..1e316e6c9279 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "addr2line" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +dependencies = [ + "gimli", +] + [[package]] name = "adler" version = "1.0.2" @@ -71,6 +80,12 @@ dependencies = [ "libc", ] +[[package]] +name = "anyhow" +version = "1.0.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" + [[package]] name = "argminmax" version = "0.6.1" @@ -103,11 +118,13 @@ source = "git+https://github.com/jorgecarleitao/arrow2?rev=ba6a882bc1542b0b89977 dependencies = [ "ahash", "arrow-format", + "async-stream", "avro-schema", - "base64", + "base64 0.21.3", "bytemuck", "chrono", "chrono-tz", + "csv-async", "dyn-clone", "either", "ethnum", @@ -116,6 +133,8 @@ dependencies = [ "futures", "getrandom", "hash_hasher", + "indexmap 1.9.3", + "json-deserializer", "lexical-core", "lz4", "multiversion", @@ -130,6 +149,19 @@ dependencies = [ "zstd", ] +[[package]] +name = "async-compat" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b48b4ff0c2026db683dea961cd8ea874737f56cffca86fa84415eaddc51c00d" +dependencies = [ + "futures-core", + "futures-io", + "once_cell", + "pin-project-lite", + "tokio", +] + [[package]] name = "async-stream" version = "0.3.5" @@ -184,20 +216,61 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5281855b39aba9684d2f47bf96983fbfd8f1725f12fabb0513a8ab879647bbd" dependencies = [ + "async-stream", "crc", "fallible-streaming-iterator", + "futures", "libflate", "serde", "serde_json", "snap", ] +[[package]] +name = "backon" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c1a6197b2120bb2185a267f6515038558b019e92b832bb0320e96d66268dcf9" +dependencies = [ + "fastrand", + "futures-core", + "pin-project", + "tokio", +] + +[[package]] +name = "backtrace" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "414dcefbc63d77c526a76b3afcf6fbb9b5e2791c19c3aa2297733208750c6e53" +[[package]] +name = "base64ct" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" + [[package]] name = "bitflags" version = "1.3.2" @@ -213,6 +286,15 @@ dependencies = [ "serde", ] +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "brotli" version = "3.3.4" @@ -234,6 +316,17 @@ dependencies = [ "alloc-stdlib", ] +[[package]] +name = "bstr" +version = "1.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" +dependencies = [ + "memchr", + "regex-automata", + "serde", +] + [[package]] name = "built" version = "0.6.1" @@ -271,6 +364,12 @@ dependencies = [ "syn 2.0.29", ] +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + [[package]] name = "bytes" version = "1.4.0" @@ -313,8 +412,11 @@ checksum = "f56b4c72906975ca04becb8a30e102dfecddd0c06181e3e95ddc444be28881f8" dependencies = [ "android-tzdata", "iana-time-zone", + "js-sys", "num-traits", "serde", + "time 0.1.45", + "wasm-bindgen", "windows-targets", ] @@ -388,12 +490,49 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "const-oid" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f" + +[[package]] +name = "const-random" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368a7a772ead6ce7e1de82bfb04c485f3db8ec744f72925af5735e29a22cc18e" +dependencies = [ + "const-random-macro", + "proc-macro-hack", +] + +[[package]] +name = "const-random-macro" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d7d6ab3c3a2282db210df5f02c4dab6e0a7057af0fb7ebd4070f30fe05c0ddb" +dependencies = [ + "getrandom", + "once_cell", + "proc-macro-hack", + "tiny-keccak", +] + [[package]] name = "core-foundation-sys" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +[[package]] +name = "cpufeatures" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" +dependencies = [ + "libc", +] + [[package]] name = "crc" version = "2.1.0" @@ -496,6 +635,84 @@ dependencies = [ "winapi", ] +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csv-async" +version = "1.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71933d3f2d0481d5111cb2817b15b6961961458ec58adf8008194e6c850046f4" +dependencies = [ + "bstr", + "cfg-if", + "csv-core", + "futures", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "der" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" +dependencies = [ + "const-oid", + "pem-rfc7468", + "zeroize", +] + +[[package]] +name = "deranged" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2696e8a945f658fd14dc3b87242e6b80cd0f36ff04ea560fa39082368847946" + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "const-oid", + "crypto-common", + "subtle", +] + +[[package]] +name = "dlv-list" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8aead04dc46b5f263c25721cf25c9e595951d15055f8063f92392fa0d7f64cf4" +dependencies = [ + "const-random", +] + [[package]] name = "dyn-clone" version = "1.0.13" @@ -508,6 +725,15 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +[[package]] +name = "encoding_rs" +version = "0.8.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" +dependencies = [ + "cfg-if", +] + [[package]] name = "enum_dispatch" version = "0.3.12" @@ -544,6 +770,21 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95765f67b4b18863968b4a1bd5bb576f732b29a4a28c7cd84c09fa3e2875f33c" +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + +[[package]] +name = "flagset" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cda653ca797810c02f7ca4b804b40b8b95ae046eb989d356bce17919a8c25499" + [[package]] name = "flate2" version = "1.0.27" @@ -564,6 +805,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "foreign_vec" version = "0.1.0" @@ -668,6 +915,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.10" @@ -677,10 +934,16 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "wasm-bindgen", ] +[[package]] +name = "gimli" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0" + [[package]] name = "git2" version = "0.17.2" @@ -700,6 +963,25 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "h2" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91fc23aa11be92976ef4729127f1a74adf36d8436f7816b185d18df956790833" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap 1.9.3", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "half" version = "1.8.2" @@ -722,6 +1004,12 @@ version = "2.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74721d007512d0cb3338cd20f0654ac913920061a4c4d0d8708edb3f2a698c0c" +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + [[package]] name = "hashbrown" version = "0.13.2" @@ -760,6 +1048,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + [[package]] name = "home" version = "0.5.5" @@ -769,6 +1066,64 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "http" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" +dependencies = [ + "bytes", + "http", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "0.14.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffb1cfd654a8219eaef89881fdb3bb3b1cdc5fa75ded05d6933b2b382e395468" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2 0.4.9", + "tokio", + "tower-service", + "tracing", + "want", +] + [[package]] name = "iana-time-zone" version = "0.1.57" @@ -802,6 +1157,16 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + [[package]] name = "indexmap" version = "2.0.0" @@ -819,12 +1184,27 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa799dd5ed20a7e349f3b4639aa80d74549c81716d9ec4f994c9b5815598306" +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + [[package]] name = "inventory" version = "0.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e1be380c410bf0595e94992a648ea89db4dd3f3354ba54af206fd2a68cf5ac8e" +[[package]] +name = "ipnet" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" + [[package]] name = "itoa" version = "1.0.9" @@ -875,6 +1255,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "json-deserializer" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f63b421e16eb4100beb677af56f0b4f3a4f08bab74ef2af079ce5bb92c2683f" +dependencies = [ + "indexmap 1.9.3", +] + [[package]] name = "jsonpath_lib" version = "0.3.0" @@ -885,6 +1274,29 @@ dependencies = [ "serde_json", ] +[[package]] +name = "jsonwebtoken" +version = "8.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378" +dependencies = [ + "base64 0.21.3", + "pem", + "ring", + "serde", + "serde_json", + "simple_asn1", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +dependencies = [ + "spin", +] + [[package]] name = "lexical" version = "6.1.1" @@ -1080,6 +1492,15 @@ dependencies = [ "rawpointer", ] +[[package]] +name = "md-5" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6365506850d44bff6e2fbcb5176cf63650e48bd45ef2fe2665ae1570e0f4b9ca" +dependencies = [ + "digest", +] + [[package]] name = "memchr" version = "2.6.1" @@ -1113,6 +1534,12 @@ dependencies = [ "libmimalloc-sys", ] +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + [[package]] name = "miniz_oxide" version = "0.7.1" @@ -1130,7 +1557,7 @@ checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" dependencies = [ "libc", "log", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys", ] @@ -1188,12 +1615,40 @@ dependencies = [ ] [[package]] -name = "num-complex" +name = "num-bigint" version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214" +checksum = "608e7659b5c3d7cba262d894801b9ec9d00de989e8a82bd4bef91d08da45cdc0" dependencies = [ - "num-traits", + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-bigint-dig" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +dependencies = [ + "byteorder", + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand", + "smallvec", + "zeroize", +] + +[[package]] +name = "num-complex" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214" +dependencies = [ + "num-traits", ] [[package]] @@ -1206,6 +1661,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.16" @@ -1241,12 +1707,64 @@ dependencies = [ "rustc-hash", ] +[[package]] +name = "object" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ac5bbd07aea88c60a577a1ce218075ffd59208b2d7ca97adf9bfc5aeb21ebe" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +[[package]] +name = "opendal" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad95e460e5976ab1b74f398ab856c59f8417b3dd32202329e3491dcbe3a6b84" +dependencies = [ + "anyhow", + "async-compat", + "async-trait", + "backon", + "base64 0.21.3", + "bytes", + "chrono", + "flagset", + "futures", + "http", + "hyper", + "log", + "md-5", + "once_cell", + "parking_lot", + "percent-encoding", + "pin-project", + "quick-xml", + "reqsign", + "reqwest", + "serde", + "serde_json", + "sha2", + "tokio", + "uuid", +] + +[[package]] +name = "ordered-multimap" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ed8acf08e98e744e5384c8bc63ceb0364e68a6854187221c18df61c4797690e" +dependencies = [ + "dlv-list", + "hashbrown 0.13.2", +] + [[package]] name = "parking_lot" version = "0.12.1" @@ -1307,6 +1825,24 @@ dependencies = [ "regex", ] +[[package]] +name = "pem" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8835c273a76a90455d7344889b0964598e3316e2a79ede8e36f16bdcf2228b8" +dependencies = [ + "base64 0.13.1", +] + +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + [[package]] name = "percent-encoding" version = "2.3.0" @@ -1351,6 +1887,26 @@ dependencies = [ "siphasher", ] +[[package]] +name = "pin-project" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fda4ed1c6c173e3fc7a83629421152e01d7b1f9b7f65fb301e490e8cfc656422" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.29", +] + [[package]] name = "pin-project-lite" version = "0.2.13" @@ -1363,6 +1919,27 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der", + "pkcs8", + "spki", +] + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der", + "spki", +] + [[package]] name = "pkg-config" version = "0.3.27" @@ -1431,7 +2008,7 @@ dependencies = [ "comfy-table", "either", "hashbrown 0.14.0", - "indexmap", + "indexmap 2.0.0", "itoap", "ndarray", "num-traits", @@ -1467,11 +2044,13 @@ version = "0.32.0" dependencies = [ "ahash", "arrow2", + "async-trait", "bytes", "chrono", "chrono-tz", "fast-float", "flate2", + "futures", "home", "lexical", "lexical-core", @@ -1479,6 +2058,7 @@ dependencies = [ "memmap2", "num-traits", "once_cell", + "opendal", "polars-arrow", "polars-core", "polars-error", @@ -1491,6 +2071,8 @@ dependencies = [ "serde_json", "simd-json", "simdutf8", + "tokio", + "tokio-util", ] [[package]] @@ -1501,7 +2083,7 @@ dependencies = [ "arrow2", "fallible-streaming-iterator", "hashbrown 0.14.0", - "indexmap", + "indexmap 2.0.0", "num-traits", "polars-arrow", "polars-error", @@ -1538,12 +2120,12 @@ version = "0.32.0" dependencies = [ "argminmax", "arrow2", - "base64", + "base64 0.21.3", "chrono", "chrono-tz", "either", "hex", - "indexmap", + "indexmap 2.0.0", "jsonpath_lib", "memchr", "polars-arrow", @@ -1664,6 +2246,12 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "proc-macro-hack" +version = "0.5.20+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" + [[package]] name = "proc-macro2" version = "1.0.66" @@ -1767,6 +2355,16 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "quick-xml" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81b9228215d82c7b61490fec1de287136b5de6f5700f6e58ea9ad61a7964ca51" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quote" version = "1.0.33" @@ -1882,12 +2480,132 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +[[package]] +name = "reqsign" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3228e570df74d69d3d3236a71371f1edd748a3e4eb728ea1f29d403bc10fc727" +dependencies = [ + "anyhow", + "async-trait", + "base64 0.21.3", + "chrono", + "form_urlencoded", + "hex", + "hmac", + "home", + "http", + "jsonwebtoken", + "log", + "once_cell", + "percent-encoding", + "quick-xml", + "rand", + "reqwest", + "rsa", + "rust-ini", + "serde", + "serde_json", + "sha1", + "sha2", + "tokio", +] + +[[package]] +name = "reqwest" +version = "0.11.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e9ad3fe7488d7e34558a2033d45a0c90b72d97b4f80705666fea71472e2e6a1" +dependencies = [ + "base64 0.21.3", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "serde", + "serde_json", + "serde_urlencoded", + "tokio", + "tokio-util", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "winreg", +] + +[[package]] +name = "ring" +version = "0.16.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" +dependencies = [ + "cc", + "libc", + "once_cell", + "spin", + "untrusted", + "web-sys", + "winapi", +] + [[package]] name = "rle-decode-fast" version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" +[[package]] +name = "rsa" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ab43bb47d23c1a631b4b680199a45255dce26fa9ab2fa902581f624ff13e6a8" +dependencies = [ + "byteorder", + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-iter", + "num-traits", + "pkcs1", + "pkcs8", + "rand_core", + "signature", + "spki", + "subtle", + "zeroize", +] + +[[package]] +name = "rust-ini" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e2a3bcec1f113553ef1c88aae6c020a369d03d55b58de9869a0908930385091" +dependencies = [ + "cfg-if", + "ordered-multimap", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + [[package]] name = "rustc-hash" version = "1.1.0" @@ -1962,7 +2680,7 @@ version = "1.0.105" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "693151e1ac27563d6dbcec9dee9fbd5da8539b20fa14ad3752b2e6d363ace360" dependencies = [ - "indexmap", + "indexmap 2.0.0", "itoa", "ryu", "serde", @@ -1977,6 +2695,40 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "sha1" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "signal-hook" version = "0.3.17" @@ -2007,6 +2759,16 @@ dependencies = [ "libc", ] +[[package]] +name = "signature" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e1788eed21689f9cf370582dfc467ef36ed9c707f073528ddafa8d83e3b8500" +dependencies = [ + "digest", + "rand_core", +] + [[package]] name = "simd-json" version = "0.10.6" @@ -2029,6 +2791,18 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" +[[package]] +name = "simple_asn1" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085" +dependencies = [ + "num-bigint", + "num-traits", + "thiserror", + "time 0.3.28", +] + [[package]] name = "siphasher" version = "0.3.11" @@ -2068,6 +2842,42 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e9f0ab6ef7eb7353d9119c170a436d1bf248eea575ac42d19d12f4e34130831" +[[package]] +name = "socket2" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "socket2" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "spin" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + +[[package]] +name = "spki" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d1e996ef02c474957d681f1b05213dfb0abab947b446a62d37770b23500184a" +dependencies = [ + "base64ct", + "der", +] + [[package]] name = "sqlparser" version = "0.36.1" @@ -2136,6 +2946,12 @@ dependencies = [ "syn 2.0.29", ] +[[package]] +name = "subtle" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" + [[package]] name = "syn" version = "1.0.109" @@ -2204,6 +3020,54 @@ dependencies = [ "syn 2.0.29", ] +[[package]] +name = "time" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" +dependencies = [ + "libc", + "wasi 0.10.0+wasi-snapshot-preview1", + "winapi", +] + +[[package]] +name = "time" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17f6bb557fd245c28e6411aa56b6403c689ad95061f50e4be16c274e70a17e48" +dependencies = [ + "deranged", + "itoa", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" + +[[package]] +name = "time-macros" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a942f44339478ef67935ab2bbaec2fb0322496cf3cbe84b261e06ac3814c572" +dependencies = [ + "time-core", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "tinyvec" version = "1.6.0" @@ -2219,6 +3083,35 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tokio" +version = "1.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17ed6077ed6cd6c74735e21f37eb16dc3935f96878b1fe961074089cc80893f9" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "pin-project-lite", + "socket2 0.5.3", + "windows-sys", +] + +[[package]] +name = "tokio-util" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", + "tracing", +] + [[package]] name = "toml" version = "0.7.6" @@ -2246,13 +3139,51 @@ version = "0.19.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8123f27e969974a3dfba720fdb560be359f57b44302d280ba72e76a74480e8a" dependencies = [ - "indexmap", + "indexmap 2.0.0", "serde", "serde_spanned", "toml_datetime", "winnow", ] +[[package]] +name = "tower-service" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" + +[[package]] +name = "tracing" +version = "0.1.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +dependencies = [ + "cfg-if", + "pin-project-lite", + "tracing-core", +] + +[[package]] +name = "tracing-core" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" + +[[package]] +name = "typenum" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" + [[package]] name = "unicode-bidi" version = "0.3.13" @@ -2286,6 +3217,12 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e1766d682d402817b5ac4490b3c3002d91dfa0d22812f341609f97b08757359c" +[[package]] +name = "untrusted" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" + [[package]] name = "url" version = "2.4.1" @@ -2297,6 +3234,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "uuid" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79daa5ed5740825c40b389c5e50312b9c86df53fccd33f281df655642b43869d" +dependencies = [ + "getrandom", + "serde", +] + [[package]] name = "value-trait" version = "0.6.1" @@ -2321,6 +3268,21 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.10.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -2352,6 +3314,18 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c02dbc21516f9f1f04f187958890d7e6026df8d16540b7ad9492bc34a67cea03" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.87" @@ -2381,6 +3355,29 @@ version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +[[package]] +name = "wasm-streams" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4609d447824375f43e1ffbc051b50ad8f4b3ae8219680c94452ea05eb240ac7" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "web-sys" +version = "0.3.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "winapi" version = "0.3.9" @@ -2487,12 +3484,28 @@ dependencies = [ "memchr", ] +[[package]] +name = "winreg" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +dependencies = [ + "cfg-if", + "windows-sys", +] + [[package]] name = "xxhash-rust" version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "735a71d46c4d68d71d4b24d03fdc2b98e38cea81730595801db779c04fe80d70" +[[package]] +name = "zeroize" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" + [[package]] name = "zstd" version = "0.12.4" From 38bb717f8ee42534fad07c5cce86f570731f77c4 Mon Sep 17 00:00:00 2001 From: Chitral Verma Date: Tue, 19 Sep 2023 01:45:54 +0530 Subject: [PATCH 2/7] FileListingUrl and opendal integration Signed-off-by: Chitral Verma --- crates/polars-error/src/lib.rs | 3 + crates/polars-io/Cargo.toml | 12 +- crates/polars-io/src/input/file_listing.rs | 435 ++++++++++++++++++ .../polars-io/src/input/files_async/avro.rs | 39 -- crates/polars-io/src/input/files_async/csv.rs | 110 ----- crates/polars-io/src/input/files_async/ipc.rs | 37 -- crates/polars-io/src/input/files_async/mod.rs | 94 ---- .../polars-io/src/input/files_async/ndjson.rs | 79 ---- .../src/input/files_async/parquet.rs | 36 -- crates/polars-io/src/input/mod.rs | 3 +- crates/polars-io/src/lib.rs | 2 +- 11 files changed, 448 insertions(+), 402 deletions(-) create mode 100644 crates/polars-io/src/input/file_listing.rs delete mode 100644 crates/polars-io/src/input/files_async/avro.rs delete mode 100644 crates/polars-io/src/input/files_async/csv.rs delete mode 100644 crates/polars-io/src/input/files_async/ipc.rs delete mode 100644 crates/polars-io/src/input/files_async/mod.rs delete mode 100644 crates/polars-io/src/input/files_async/ndjson.rs delete mode 100644 crates/polars-io/src/input/files_async/parquet.rs diff --git a/crates/polars-error/src/lib.rs b/crates/polars-error/src/lib.rs index db190d63370b..aecc48ab771f 100644 --- a/crates/polars-error/src/lib.rs +++ b/crates/polars-error/src/lib.rs @@ -67,6 +67,8 @@ pub enum PolarsError { StringCacheMismatch(ErrString), #[error("field not found: {0}")] StructFieldNotFound(ErrString), + #[error("generic error: {0}")] + Generic(Box), } impl From for PolarsError { @@ -113,6 +115,7 @@ impl PolarsError { ShapeMismatch(msg) => ShapeMismatch(func(msg).into()), StringCacheMismatch(msg) => StringCacheMismatch(func(msg).into()), StructFieldNotFound(msg) => StructFieldNotFound(func(msg).into()), + Generic(err) => ComputeError(func(&format!("IO: {err}")).into()), } } } diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index 80ee7be70132..358934f4d901 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -38,14 +38,16 @@ serde = { workspace = true, features = ["derive"], optional = true } serde_json = { version = "1", default-features = false, features = ["alloc", "raw_value"], optional = true } simd-json = { workspace = true, optional = true } simdutf8 = { version = "0.1", optional = true } -tokio = { version = "1.26", features = ["net", "io-util"] } +tokio = { version = "1.26", features = ["net", "io-util", "rt-multi-thread"] } tokio-util = { version = "0.7.8", features = ["io", "io-util"] } -url = { workspace = true, optional = true } +url = { workspace = true } +glob = { version = "0.3" } +percent-encoding = { version = "2.3" } +itertools = { version = "0" } [dependencies.opendal] -git ="https://github.com/apache/incubator-opendal.git" +version = "0.40" default-features = false -tag = "v0.40.0-rc.2" features = [ "services-azblob", "services-azdls", @@ -105,7 +107,7 @@ fmt = ["polars-core/fmt"] lazy = [] parquet = ["polars-core/parquet", "arrow/io_parquet", "arrow/io_parquet_compression"] async = ["arrow/io_ipc_write_async", "polars-error/regex"] -cloud = ["object_store", "async", "url"] +cloud = ["object_store", "async" ] aws = ["object_store/aws", "cloud", "polars-core/aws"] azure = ["object_store/azure", "cloud", "polars-core/azure"] gcp = ["object_store/gcp", "cloud", "polars-core/gcp"] diff --git a/crates/polars-io/src/input/file_listing.rs b/crates/polars-io/src/input/file_listing.rs new file mode 100644 index 000000000000..b8e6df7fd617 --- /dev/null +++ b/crates/polars-io/src/input/file_listing.rs @@ -0,0 +1,435 @@ +//! This module is inspired from [arrow-datafusion](https://github.com/apache/arrow-datafusion/blob/f4c4ee1e7ffa97b089994162c3d754402f218503/datafusion/core/src/datasource/listing/url.rs) but decoupled from object_store crate in favour of opendal crate. + +use std::collections::HashMap; +use std::path::{Component, PathBuf}; + +use futures::stream::BoxStream; +use futures::{StreamExt, TryStreamExt}; +use glob::Pattern; +use itertools::Itertools; +use opendal::layers::RetryLayer; +use opendal::{ + Error as OpenDalError, ErrorKind as OpenDalErrorKind, Metadata, Metakey, Operator, Scheme, +}; +use percent_encoding; +use polars_error::{to_compute_err, PolarsError, PolarsResult}; +use url::Url; + +pub(crate) const DELIMITER: &str = "/"; + +/// A parsed URL identifying files for a listing table, see [`FileListingUrl::parse`] +/// for more information on the supported expressions +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +pub struct FileListingUrl { + /// A URL that identifies a file or directory to list files from + url: Url, + /// The path prefix + prefix: String, + /// An optional glob expression used to filter files + glob: Option, +} + +impl FileListingUrl { + /// Parse a provided string as a [[FileListingUrl]] + /// + /// # Paths without a Scheme + /// + /// If no scheme is provided, or the string is an absolute filesystem path + /// as determined [`std::path::Path::is_absolute`], the string will be + /// interpreted as a path on the local filesystem using the operating + /// system's standard path delimiter, i.e. `\` on Windows, `/` on Unix. + /// + /// If the path contains any of `'?', '*', '['`, it will be considered + /// a glob expression and resolved as described in the section below. + /// + /// Otherwise, the path will be resolved to an absolute path, returning + /// an error if it does not exist, and converted to a [file URI] + /// + /// If you wish to specify a path that does not exist on the local + /// machine you must provide it as a fully-qualified [file URI] + /// e.g. `file:///myfile.txt` + /// + /// ## Glob File Paths + /// + /// If no scheme is provided, and the path contains a glob expression, it will + /// be resolved as follows. + /// + /// The string up to the first path segment containing a glob expression will be extracted, + /// and resolved in the same manner as a normal scheme-less path. That is, resolved to + /// an absolute path on the local filesystem, returning an error if it does not exist, + /// and converted to a [file URI] + /// + /// The remaining string will be interpreted as a [`Pattern`] and used as a + /// filter when listing files from object storage + /// + /// [file URI]: https://en.wikipedia.org/wiki/File_URI_scheme + pub fn parse(s: impl AsRef) -> PolarsResult { + let s = s.as_ref(); + + // This is necessary to handle the case of a path starting with a drive letter + if std::path::Path::new(s).is_absolute() { + return Self::from_path(s); + } + + match Url::parse(s) { + Ok(mut url) => { + let (prefix, glob) = Self::parse_to_prefix(url.path())?; + url.set_path(prefix.as_str()); + Ok(Self { url, prefix, glob }) + }, + Err(url::ParseError::RelativeUrlWithoutBase) => Self::from_path(s), + Err(e) => Err(PolarsError::Generic(Box::new(e))), + } + } + + fn from_path(url_path: &str) -> PolarsResult { + let (prefix, glob) = Self::parse_to_prefix(url_path)?; + let path = std::path::Path::new(prefix.as_str()); + if path.is_dir() { + Url::from_directory_path(path) + } else { + Url::from_file_path(path) + } + .map(|mut url| { + url.set_path(prefix.as_str()); + Self { url, prefix, glob } + }) + .map_err(|_| to_compute_err(format!("Can not open path: {url_path}"))) + } + + /// Creates a new [`FileListingUrl`] interpreting `s` as a filesystem path + fn parse_to_prefix(url_path: &str) -> PolarsResult<(String, Option)> { + let (prefix, glob) = match split_glob_expression(url_path) { + Some((prefix, glob)) => { + let glob = Pattern::new(glob).map_err(|e| PolarsError::Generic(Box::new(e)))?; + (prefix, Some(glob)) + }, + None => (url_path, None), + }; + + let prefix_path = std::path::Path::new(prefix); + let normalized_prefix = normalize_path(prefix_path); + + let decoded_prefix = + percent_encoding::percent_decode_str(normalized_prefix.to_string_lossy().as_ref()) + .decode_utf8_lossy() + .to_string(); + + let suffixed = if glob.is_some() || url_path.ends_with(DELIMITER) { + decoded_prefix + DELIMITER + } else { + decoded_prefix + }; + + Ok((suffixed, glob)) + } + + /// Returns the URL scheme + pub fn url(&self) -> Url { + self.url.clone() + } + + /// Returns the URL scheme + pub fn scheme(&self) -> &str { + self.url.scheme() + } + + /// Return the prefix from which to list files + pub fn prefix(&self) -> &str { + self.prefix.as_str() + } + + /// Return the prefix from which to list files + pub fn prefix_as_path(&self) -> PathBuf { + std::path::PathBuf::from(self.prefix.as_str()) + } + + /// Return the prefix from which to list files + pub fn is_prefix_dir(&self) -> bool { + self.prefix.ends_with(DELIMITER) + } + + pub fn infer_operator(&self, opts: HashMap) -> PolarsResult { + let mut _opts = opts; + + let scheme = match (self.url.scheme(), self.url.host_str()) { + ("file", None) => Scheme::Fs, + ("az", _) => Scheme::Azblob, + ("adl" | "adls" | "abfs" | "abfss" | "azdfs" | "azdls", _) => Scheme::Azdls, + ("s3" | "s3a", Some(bucket)) => { + _opts.insert("bucket".to_string(), bucket.to_string()); + Scheme::S3 + }, + ("gs", Some(bucket)) => { + _opts.insert("bucket".to_string(), bucket.to_string()); + Scheme::Gcs + }, + (scheme, _) => { + return Err(OpenDalError::new( + OpenDalErrorKind::Unsupported, + format!("Unable to recognise URL with scheme\"{scheme}\"",).as_str(), + )) + .map_err(|e| PolarsError::Generic(Box::new(e))) + }, + }; + + let root = if self.is_prefix_dir() { + self.prefix().to_string() + } else { + let binding = self.prefix_as_path(); + let path_buf = binding.parent().and_then(|x| x.to_str()); + + match path_buf { + None => DELIMITER, + Some("") => DELIMITER, + Some(p) => p, + } + .to_string() + }; + + _opts.insert("root".to_string(), root); + + Operator::via_map(scheme, _opts) + .and_then(|op| Ok(op.layer(RetryLayer::new()))) + .map_err(|e| PolarsError::Generic(Box::new(e))) + } + + /// Returns `true` if `path` matches this [`FileListingUrl`] + pub fn contains(&self, path: &String) -> bool { + match self.strip_prefix(path) { + Some(mut segments) => match &self.glob { + Some(glob) => { + let stripped = segments.join("/"); + glob.matches(&stripped) + }, + None => true, + }, + None => false, + } + } + + /// Strips the prefix of this [`FileListingUrl`] from the provided path, returning + /// an iterator of the remaining path segments + pub(crate) fn strip_prefix<'a, 'b: 'a>( + &'a self, + path: &'b String, + ) -> Option + 'a> { + Some(path.as_str().split_terminator(DELIMITER)) + } + + /// Streams all objects identified by this [`FileListingUrl`] for the provided options + pub async fn glob_object_stream<'a>( + &'a self, + store: &'a Operator, + file_extension: &'a str, + exclude_empty: bool, + recursive: bool, + ) -> PolarsResult>> { + let stream = if self.is_prefix_dir() { + futures::stream::once( + store + .lister_with(DELIMITER) + .delimiter(if recursive { "" } else { DELIMITER }) + .metakey(Metakey::Mode | Metakey::ContentLength), + ) + .try_flatten() + .map_ok(|e| (e.path().to_string(), e.metadata().to_owned())) + .map_err(|e| PolarsError::Generic(Box::new(e))) + .boxed() + } else { + let binding = &self.prefix_as_path(); + let file_name = binding + .file_name() + .and_then(|x| x.to_str()) + .ok_or_else(|| { + to_compute_err(format!("cannot get file name from: {}", self.prefix())) + })?; + + let metadata = store + .stat(file_name) + .await + .map_err(|e| PolarsError::Generic(Box::new(e)))?; + + let object = (file_name.to_string(), metadata); + futures::stream::once(async { Ok(object) }).boxed() + } + .try_filter(move |(path, metadata)| { + let is_file = !path.ends_with(DELIMITER); + let extension_match = path.ends_with(file_extension); + let glob_match = self.contains(path); + let is_empty = exclude_empty && metadata.content_length() == 0; + + futures::future::ready(is_file && extension_match && glob_match && !is_empty) + }) + .boxed(); + + Ok(stream) + } + + /// Lists all objects identified by this [`FileListingUrl`] for the provided options + pub async fn glob_object_list<'a>( + &'a self, + store: &'a Operator, + file_extension: &'a str, + exclude_empty: bool, + recursive: bool, + ) -> PolarsResult> { + let stream = self + .glob_object_stream(store, file_extension, exclude_empty, recursive) + .await?; + let list = stream.try_collect::>().await?; + + Ok(list) + } + + /// Returns this [`FileListingUrl`] as a string + pub fn as_str(&self) -> &str { + self.url.as_str() + } +} + +impl std::fmt::Display for FileListingUrl { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.as_str().fmt(f) + } +} + +const GLOB_START_CHARS: [char; 3] = ['?', '*', '[']; + +/// Splits `path` at the first path segment containing a glob expression, returning +/// `None` if no glob expression found. +/// +/// Path delimiters are determined using [`std::path::is_separator`] which +/// permits `/` as a path delimiter even on Windows platforms. +/// +fn split_glob_expression(path: &str) -> Option<(&str, &str)> { + let mut last_separator = 0; + + for (byte_idx, char) in path.char_indices() { + if GLOB_START_CHARS.contains(&char) { + if last_separator == 0 { + return Some((".", path)); + } + return Some(path.split_at(last_separator)); + } + + if std::path::is_separator(char) { + last_separator = byte_idx + char.len_utf8(); + } + } + None +} + +/// Normalize a path, removing things like `.` and `..`. +/// +/// CAUTION: This does not resolve symlinks (unlike +/// [`std::fs::canonicalize`]). This may cause incorrect or surprising +/// behavior at times. This should be used carefully. Unfortunately, +/// [`std::fs::canonicalize`] can be hard to use correctly, since it can often +/// fail, or on Windows returns annoying device paths. +fn normalize_path(path: &std::path::Path) -> PathBuf { + let mut components = path.components().peekable(); + let mut ret = if let Some(c @ Component::Prefix(..)) = components.peek().cloned() { + components.next(); + PathBuf::from(c.as_os_str()) + } else { + PathBuf::new() + }; + + for component in components { + match component { + Component::Prefix(..) => unreachable!(), + Component::RootDir => { + ret.push(component.as_os_str()); + }, + Component::CurDir => {}, + Component::ParentDir => { + ret.pop(); + }, + Component::Normal(c) => { + ret.push(c); + }, + } + } + ret +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_prefix_path() { + let root = std::env::current_dir().unwrap(); + let root = root.to_string_lossy(); + + let url = FileListingUrl::parse(root).unwrap(); + let child = String::from("partition/file"); + let prefix: Vec<_> = url.strip_prefix(&child).unwrap().collect(); + assert_eq!(prefix, vec!["partition", "file"]); + + let url = FileListingUrl::parse("file:///").unwrap(); + let child = String::from("foo/bar"); + let prefix: Vec<_> = url.strip_prefix(&child).unwrap().collect(); + assert_eq!(prefix, vec!["foo", "bar"]); + + let url = FileListingUrl::parse("file:///foo").unwrap(); + let child = String::from("/foob/bar"); + assert!(url.strip_prefix(&child).is_some()); + + let url = FileListingUrl::parse("file:///foo/file").unwrap(); + let child = String::from("foo/file"); + assert_eq!(url.strip_prefix(&child).unwrap().count(), 2); + + let url = FileListingUrl::parse("file:///foo/ bar").unwrap(); + assert_eq!(url.prefix, "/foo/ bar"); + + let url = FileListingUrl::parse("file:///foo/bar?").unwrap(); + assert_eq!(url.prefix, "/foo/bar"); + } + + #[test] + fn test_prefix_s3() { + let url = FileListingUrl::parse("s3://bucket/foo/bar/").unwrap(); + assert_eq!(url.prefix(), "/foo/bar/"); + + let child = String::from("partition/foo.parquet"); + let prefix: Vec<_> = url.strip_prefix(&child).unwrap().collect(); + assert_eq!(prefix, vec!["partition", "foo.parquet"]); + } + + #[test] + fn test_split_glob() { + fn test(input: &str, expected: Option<(&str, &str)>) { + assert_eq!( + split_glob_expression(input), + expected, + "testing split_glob_expression with {input}" + ); + } + + // no glob patterns + test("/", None); + test("/a.txt", None); + test("/a", None); + test("/a/", None); + test("/a/b", None); + test("/a/b/", None); + test("/a/b.txt", None); + test("/a/b/c.txt", None); + // glob patterns, thus we build the longest path (os-specific) + test("*.txt", Some((".", "*.txt"))); + test("/*.txt", Some(("/", "*.txt"))); + test("/a/*b.txt", Some(("/a/", "*b.txt"))); + test("/a/*/b.txt", Some(("/a/", "*/b.txt"))); + test("/a/b/[123]/file*.txt", Some(("/a/b/", "[123]/file*.txt"))); + test("/a/b*.txt", Some(("/a/", "b*.txt"))); + test("/a/b/**/c*.txt", Some(("/a/b/", "**/c*.txt"))); + + // https://github.com/apache/arrow-datafusion/issues/2465 + test( + "/a/b/c//alltypes_plain*.parquet", + Some(("/a/b/c//", "alltypes_plain*.parquet")), + ); + } +} diff --git a/crates/polars-io/src/input/files_async/avro.rs b/crates/polars-io/src/input/files_async/avro.rs deleted file mode 100644 index 1e3d24536367..000000000000 --- a/crates/polars-io/src/input/files_async/avro.rs +++ /dev/null @@ -1,39 +0,0 @@ -use async_trait::async_trait; -use opendal::Operator; -use polars_core::prelude::ArrowSchema; -use polars_error::{to_compute_err, PolarsResult}; - -use crate::input::files_async::FileFormat; - -#[derive(Debug, Default)] -pub struct AvroFormat {} - -impl AvroFormat { - /// Construct a new Format with no local overrides - pub fn new() -> Self { - Self::default() - } -} - -#[async_trait] -impl FileFormat for AvroFormat { - /// Read and parse the schema of the Avro file at location `path` - async fn fetch_schema_async( - &self, - store_op: &Operator, - path: String, - ) -> PolarsResult { - let mut reader = store_op - .reader(path.as_str()) - .await - .map_err(to_compute_err)?; - - let metadata = arrow::io::avro::avro_schema::read_async::read_metadata(&mut reader) - .await - .map_err(to_compute_err)?; - let schema = - arrow::io::avro::read::infer_schema(&metadata.record).map_err(to_compute_err)?; - - Ok(schema) - } -} diff --git a/crates/polars-io/src/input/files_async/csv.rs b/crates/polars-io/src/input/files_async/csv.rs deleted file mode 100644 index 3839d512003a..000000000000 --- a/crates/polars-io/src/input/files_async/csv.rs +++ /dev/null @@ -1,110 +0,0 @@ -use async_trait::async_trait; -use opendal::Operator; -use polars_core::prelude::ArrowSchema; -use polars_error::{to_compute_err, PolarsResult}; - -use crate::input::files_async::{FileFormat, DEFAULT_SCHEMA_INFER_MAX_RECORD}; - -#[derive(Debug)] -pub struct CSVFormat { - schema_infer_max_records: Option, - has_header: bool, - delimiter: u8, - comment_char: Option, - try_parse_dates: bool, -} - -impl Default for CSVFormat { - fn default() -> Self { - Self { - schema_infer_max_records: Some(DEFAULT_SCHEMA_INFER_MAX_RECORD), - has_header: true, - delimiter: b',', - comment_char: None, - try_parse_dates: true, - } - } -} - -impl CSVFormat { - /// Construct a new Format with no local overrides - pub fn new() -> Self { - Self::default() - } - - /// Sets a limit in terms of records to scan to infer the schema - /// The default is `DEFAULT_SCHEMA_INFER_MAX_RECORD` - pub fn set_schema_infer_max_records(mut self, schema_infer_max_records: Option) -> Self { - self.schema_infer_max_records = schema_infer_max_records; - self - } - - /// Whether to treat the first row as a special header row. - /// The default is `true`. - pub fn set_has_header(mut self, has_header: bool) -> Self { - self.has_header = has_header; - self - } - - /// Sets the field delimiter to use when parsing CSV. - /// - /// The default is `b','`. - pub fn set_delimiter(mut self, delimiter: u8) -> Self { - self.delimiter = delimiter; - self - } - - /// The comment character to use when parsing CSV. - /// If the start of a record begins with the byte given here, then that line is ignored by the CSV parser. - /// This is disabled by default. - pub fn set_comment_char(mut self, comment_char: Option) -> Self { - self.comment_char = comment_char; - self - } - - /// Automatically try to parse dates/ datetimes and time. - /// If parsing fails, columns remain of dtype `[DataType::Utf8]`. - /// The default is `true`. - pub fn set_try_parse_dates(mut self, try_parse_dates: bool) -> Self { - self.try_parse_dates = try_parse_dates; - self - } -} - -#[async_trait] -impl FileFormat for CSVFormat { - /// Read and parse the schema of the CSV file at location `path` - async fn fetch_schema_async( - &self, - store_op: &Operator, - path: String, - ) -> PolarsResult { - let reader = store_op - .reader(path.as_str()) - .await - .map_err(to_compute_err)?; - - let mut async_reader = arrow::io::csv::read_async::AsyncReaderBuilder::new() - .delimiter(self.delimiter) - .comment(self.comment_char) - .has_headers(self.has_header) - .create_reader(reader); - - let (fields, _) = arrow::io::csv::read_async::infer_schema( - &mut async_reader, - self.schema_infer_max_records, - self.has_header, - &|input: &[u8]| { - crate::csv::utils::infer_field_schema( - std::str::from_utf8(input).unwrap(), - self.try_parse_dates, - ) - .to_arrow() - }, - ) - .await - .map_err(to_compute_err)?; - - Ok(ArrowSchema::from(fields)) - } -} diff --git a/crates/polars-io/src/input/files_async/ipc.rs b/crates/polars-io/src/input/files_async/ipc.rs deleted file mode 100644 index 547d64f71269..000000000000 --- a/crates/polars-io/src/input/files_async/ipc.rs +++ /dev/null @@ -1,37 +0,0 @@ -use async_trait::async_trait; -use opendal::Operator; -use polars_core::prelude::ArrowSchema; -use polars_error::{to_compute_err, PolarsResult}; - -use crate::input::files_async::FileFormat; - -#[derive(Debug, Default)] -pub struct IPCFormat {} - -impl IPCFormat { - /// Construct a new Format with no local overrides - pub fn new() -> Self { - Self::default() - } -} - -#[async_trait] -impl FileFormat for IPCFormat { - /// Read and parse the schema of the IPC file at location `path` - async fn fetch_schema_async( - &self, - store_op: &Operator, - path: String, - ) -> PolarsResult { - let mut reader = store_op - .reader(path.as_str()) - .await - .map_err(to_compute_err)?; - - let metadata = arrow::io::ipc::read::file_async::read_file_metadata_async(&mut reader) - .await - .map_err(to_compute_err)?; - - Ok(metadata.schema) - } -} diff --git a/crates/polars-io/src/input/files_async/mod.rs b/crates/polars-io/src/input/files_async/mod.rs deleted file mode 100644 index 2e929d152643..000000000000 --- a/crates/polars-io/src/input/files_async/mod.rs +++ /dev/null @@ -1,94 +0,0 @@ -use async_trait::async_trait; -use futures::{StreamExt, TryStreamExt}; -use opendal::Operator; -use polars_core::prelude::{ArrowSchema, Schema}; -use polars_error::{polars_bail, polars_ensure, to_compute_err, PolarsResult}; - -#[cfg(feature = "avro")] -pub mod avro; -#[cfg(feature = "csv")] -pub mod csv; -#[cfg(any(feature = "ipc", feature = "ipc_streaming"))] -pub mod ipc; -#[cfg(feature = "json")] -pub mod ndjson; -#[cfg(feature = "parquet")] -pub mod parquet; - -pub trait FileFormatOptions {} - -/// Default max records to scan to infer the schema -const DEFAULT_SCHEMA_INFER_MAX_RECORD: usize = 1000; - -/// The number of files to read in parallel when inferring schema -const SCHEMA_INFERENCE_CONCURRENCY: usize = 32; - -#[async_trait] -pub trait FileFormat: Send + Sync + std::fmt::Debug { - /// Infer the common schema of the provided objects. - /// For more than one file, the schema of all the files must be merge-able if `strict_schema == true` - /// or else this might fail. - /// The implementations handle whether the schema inference from either file metadata or from its content. - async fn infer_schema_async( - &self, - store_op: &Operator, - objects: Vec, - strict_schema: bool, - ) -> PolarsResult { - polars_ensure!(!objects.is_empty(), NoData: "at least one path must be provided to infer schema"); - - let schemas: Vec<_> = futures::stream::iter(objects) - .map(|object| self.fetch_schema_async(store_op, object)) - .boxed() // Workaround https://github.com/rust-lang/rust/issues/64552 - .buffered(SCHEMA_INFERENCE_CONCURRENCY) - .try_collect() - .await?; - - self.handle_schema(schemas, strict_schema) - } - - /// Read and parse the schema of the Avro file at location `path` - async fn fetch_schema_async( - &self, - store_op: &Operator, - path: String, - ) -> PolarsResult; - - // fn read( - // &self, - // n_rows: Option, - // columns: Option>, - // predicate: Option>, - // projection: Option>, - // options: O, - // ) -> PolarsResult; - // - // fn get_batches(&self) -> PolarsResult> { - // polars_bail!(ComputeError: "Functionality `get_batches` is currently not supported.") - // } - - fn handle_schema( - &self, - schemas: Vec, - strict_schema: bool, - ) -> PolarsResult { - let schema = if strict_schema { - let s = schemas - .windows(2) - .all(|a| a[0] == a[1]) - .then(|| &schemas[0]) - .ok_or(to_compute_err("Schemas of all files must match."))?; - - Schema::from_iter(s.clone().fields.iter()) - } else { - let mut default_schema = Schema::default(); - for s in schemas { - default_schema.merge(Schema::from_iter(s.fields.iter())) - } - - default_schema - }; - - Ok(schema) - } -} diff --git a/crates/polars-io/src/input/files_async/ndjson.rs b/crates/polars-io/src/input/files_async/ndjson.rs deleted file mode 100644 index aa012f16f80f..000000000000 --- a/crates/polars-io/src/input/files_async/ndjson.rs +++ /dev/null @@ -1,79 +0,0 @@ -use arrow::array::StructArray; -use async_trait::async_trait; -use opendal::Operator; -use polars_core::prelude::ArrowSchema; -use polars_error::{to_compute_err, PolarsResult}; -use tokio::io::AsyncBufReadExt; -use tokio_util::io::StreamReader; - -use crate::input::files_async::{FileFormat, DEFAULT_SCHEMA_INFER_MAX_RECORD}; - -#[derive(Debug)] -pub struct NdJSONFormat { - schema_infer_max_records: Option, -} - -impl Default for NdJSONFormat { - fn default() -> Self { - Self { - schema_infer_max_records: Some(DEFAULT_SCHEMA_INFER_MAX_RECORD), - } - } -} - -impl NdJSONFormat { - /// Construct a new Format with no local overrides - pub fn new() -> Self { - Self::default() - } - - /// Sets a limit in terms of records to scan to infer the schema - /// The default is `DEFAULT_SCHEMA_INFER_MAX_RECORD` - pub fn set_schema_infer_max_records(mut self, schema_infer_max_records: Option) -> Self { - self.schema_infer_max_records = schema_infer_max_records; - self - } -} - -#[async_trait] -impl FileFormat for NdJSONFormat { - /// Read and parse the schema of the ndjson/ jsonl file at location `path` - async fn fetch_schema_async( - &self, - store_op: &Operator, - path: String, - ) -> PolarsResult { - let reader = store_op - .reader(path.as_str()) - .await - .map_err(to_compute_err)?; - - let mut stream_reader = StreamReader::new(reader); - - let mut line = String::new(); - let mut lines = Vec::new(); - - loop { - line.clear(); - let len = stream_reader - .read_line(&mut line) - .await - .map_err(to_compute_err)?; - if len == 0 - || lines.len() - >= self - .schema_infer_max_records - .unwrap_or(DEFAULT_SCHEMA_INFER_MAX_RECORD) - { - break; - } - - lines.push(line.clone()); - } - - let dt = arrow::io::ndjson::read::infer_iter(lines.iter()).map_err(to_compute_err)?; - let schema = ArrowSchema::from(StructArray::get_fields(&dt).to_vec()); - - Ok(schema) - } -} diff --git a/crates/polars-io/src/input/files_async/parquet.rs b/crates/polars-io/src/input/files_async/parquet.rs deleted file mode 100644 index 0aaade150587..000000000000 --- a/crates/polars-io/src/input/files_async/parquet.rs +++ /dev/null @@ -1,36 +0,0 @@ -use async_trait::async_trait; -use opendal::Operator; -use polars_core::prelude::ArrowSchema; -use polars_error::{to_compute_err, PolarsResult}; - -use crate::input::files_async::FileFormat; - -#[derive(Debug, Default)] -pub struct ParquetFormat {} - -impl ParquetFormat { - /// Construct a new Format with no local overrides - pub fn new() -> Self { - Self::default() - } -} - -#[async_trait] -impl FileFormat for ParquetFormat { - /// Read and parse the schema of the Parquet file at location `path` - async fn fetch_schema_async( - &self, - store_op: &Operator, - path: String, - ) -> PolarsResult { - let mut reader = store_op - .reader(path.as_str()) - .await - .map_err(to_compute_err)?; - - let metadata = arrow::io::parquet::read::read_metadata_async(&mut reader).await?; - let schema = arrow::io::parquet::read::infer_schema(&metadata).map_err(to_compute_err)?; - - Ok(schema) - } -} diff --git a/crates/polars-io/src/input/mod.rs b/crates/polars-io/src/input/mod.rs index 9bb646c2405b..bcbcd81eed5d 100644 --- a/crates/polars-io/src/input/mod.rs +++ b/crates/polars-io/src/input/mod.rs @@ -1 +1,2 @@ -pub mod files_async; +pub mod file_format; +pub mod file_listing; diff --git a/crates/polars-io/src/lib.rs b/crates/polars-io/src/lib.rs index 8ccd75103f84..7bea49140992 100644 --- a/crates/polars-io/src/lib.rs +++ b/crates/polars-io/src/lib.rs @@ -39,7 +39,7 @@ use std::path::{Path, PathBuf}; #[allow(unused)] // remove when updating to rust nightly >= 1.61 use arrow::array::new_empty_array; use arrow::error::Result as ArrowResult; -pub use input::files_async; +pub use input::file_format; pub use options::*; use polars_core::frame::ArrowChunk; use polars_core::prelude::*; From 4df1377d4f7fb22bcd4c87cf8af93059e80797e9 Mon Sep 17 00:00:00 2001 From: Chitral Verma Date: Fri, 22 Sep 2023 21:17:27 +0530 Subject: [PATCH 3/7] Cloud scan for parquet Signed-off-by: Chitral Verma --- crates/polars-io/src/input/file_format/mod.rs | 91 ++++++++++++ .../src/input/file_format/parquet.rs | 77 +++++++++++ crates/polars-io/src/input/file_listing.rs | 36 ++--- crates/polars-io/src/input/mod.rs | 39 ++++++ .../polars-lazy/src/frame/file_list_reader.rs | 43 ++++++ crates/polars-lazy/src/frame/parquet.rs | 130 +++++++++++++++++- 6 files changed, 392 insertions(+), 24 deletions(-) create mode 100644 crates/polars-io/src/input/file_format/mod.rs create mode 100644 crates/polars-io/src/input/file_format/parquet.rs diff --git a/crates/polars-io/src/input/file_format/mod.rs b/crates/polars-io/src/input/file_format/mod.rs new file mode 100644 index 000000000000..6dde7351103a --- /dev/null +++ b/crates/polars-io/src/input/file_format/mod.rs @@ -0,0 +1,91 @@ +use std::collections::HashMap; +use std::fmt::Debug; + +use async_trait::async_trait; +use futures::{StreamExt, TryStreamExt}; +use opendal::Operator; +use polars_core::prelude::Schema; +use polars_error::PolarsResult; + +use crate::input::file_listing::ObjectListingUrl; +use crate::input::try_blocking_io; + +#[cfg(feature = "avro")] +pub mod avro; +#[cfg(feature = "csv")] +pub mod csv; +#[cfg(any(feature = "ipc", feature = "ipc_streaming"))] +pub mod ipc; +#[cfg(feature = "json")] +pub mod ndjson; +#[cfg(feature = "parquet")] +pub mod parquet; + +pub trait FileFormatOptions {} + +/// Default max records to scan to infer the schema +const DEFAULT_SCHEMA_INFER_MAX_RECORD: usize = 1000; + +/// The number of objects to read in parallel when inferring schema +const SCHEMA_INFERENCE_CONCURRENCY: usize = 32; + +pub type DynFileFormat = dyn FileFormat; + +#[async_trait] +pub trait FileFormat: std::fmt::Display + Send + Sync + Debug + 'static { + /// To instantiate + fn create() -> Self; + + /// Uses a size hint obtained from the reader to produce, + /// - Known row count (may or may not be known) + /// - Estimated row count (can be calculated from reader hints) + fn calculate_rows_count( + &self, + reader_size_hint: (usize, Option), + ) -> (Option, usize); + + /// Globs object info (path, schema, size_hint) for a given set of options and + /// a base [ObjectListingUrl]. + /// Operator to connect to the remote store is inferred internally. + /// + /// This is a sync API but runs the tasks for each child in an async manner internally + /// and blocks till all tasks are successfully completed. + fn glob_object_info( + &self, + listing_url: ObjectListingUrl, + cloud_opts: HashMap, + exclude_empty: bool, + recursive: bool, + ) -> PolarsResult, usize))>> { + try_blocking_io(async { + let url = listing_url.clone(); + let operator = url + .infer_operator(cloud_opts) + .expect("failed to create an operator for remote store"); + + let objects = url + .glob_object_list(&operator, "", exclude_empty, recursive) + .await + .expect("failed to glob objects from remote store"); + + let object_infos = futures::stream::iter(objects) + .map(|(path, _)| async { self.get_object_info(&operator, path).await }) + .buffer_unordered(SCHEMA_INFERENCE_CONCURRENCY) + .try_collect::>() + .await + .expect("failed to get info for one or more objects"); + + object_infos + }) + } + + /// Fetches metadata of an object from the provided `path` and returns the results as + /// object info (path, schema, size_hint). + /// + /// The [Schema] is inferred from the format specific metadata. + async fn get_object_info( + &self, + operator: &Operator, + path: String, + ) -> PolarsResult<(String, Schema, (Option, usize))>; +} diff --git a/crates/polars-io/src/input/file_format/parquet.rs b/crates/polars-io/src/input/file_format/parquet.rs new file mode 100644 index 000000000000..e6434a891f9e --- /dev/null +++ b/crates/polars-io/src/input/file_format/parquet.rs @@ -0,0 +1,77 @@ +use std::fmt::{Display, Formatter}; + +use arrow::io::parquet::read::FileMetaData; +use async_trait::async_trait; +use futures::Stream; +use opendal::Operator; +use polars_core::schema::Schema; +use polars_error::{to_compute_err, PolarsResult}; + +use crate::input::file_format::FileFormat; + +#[derive(Debug)] +pub struct ParquetFormat {} + +impl ParquetFormat { + /// Read and parse the schema of the Parquet file at location `path` + async fn fetch_metadata_async( + &self, + operator: &Operator, + path: impl AsRef, + ) -> PolarsResult<(FileMetaData, (usize, Option))> { + let mut reader = operator + .reader(path.as_ref()) + .await + .map_err(to_compute_err)?; + + let metadata = arrow::io::parquet::read::read_metadata_async(&mut reader).await?; + Ok((metadata, reader.size_hint())) + } + + /// Parses the schema of the Parquet file from FileMetadata + fn infer_schema(&self, metadata: FileMetaData) -> PolarsResult { + let arrow_schema = + arrow::io::parquet::read::infer_schema(&metadata).map_err(to_compute_err)?; + let polars_schema = Schema::from_iter(arrow_schema.clone().fields.iter()); + + Ok(polars_schema) + } +} + +impl Display for ParquetFormat { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "ParquetFormat()") + } +} + +#[async_trait] +impl FileFormat for ParquetFormat { + /// Construct a new Format with no local overrides + fn create() -> Self { + Self {} + } + + /// Uses a size hint obtained from the reader to produce, + /// - Known row count (may or may not be known) + /// - Estimated row count (can be calculated from reader hints) + fn calculate_rows_count( + &self, + _reader_size_hint: (usize, Option), + ) -> (Option, usize) { + let (estimated, known) = _reader_size_hint; + (known, estimated) + } + + async fn get_object_info( + &self, + operator: &Operator, + path: String, + ) -> PolarsResult<(String, Schema, (Option, usize))> { + let (metadata, _) = self.fetch_metadata_async(operator, path.clone()).await?; + let num_rows = &metadata.num_rows; + let size_hint = self.calculate_rows_count((*num_rows, Some(*num_rows))); + let polars_schema = self.infer_schema(metadata.clone())?; + + Ok((path.to_string(), polars_schema, size_hint)) + } +} diff --git a/crates/polars-io/src/input/file_listing.rs b/crates/polars-io/src/input/file_listing.rs index b8e6df7fd617..a70d2a95d442 100644 --- a/crates/polars-io/src/input/file_listing.rs +++ b/crates/polars-io/src/input/file_listing.rs @@ -17,10 +17,10 @@ use url::Url; pub(crate) const DELIMITER: &str = "/"; -/// A parsed URL identifying files for a listing table, see [`FileListingUrl::parse`] +/// A parsed URL identifying files for a listing table, see [`ObjectListingUrl::parse`] /// for more information on the supported expressions #[derive(Debug, Clone, Eq, PartialEq, Hash)] -pub struct FileListingUrl { +pub struct ObjectListingUrl { /// A URL that identifies a file or directory to list files from url: Url, /// The path prefix @@ -29,8 +29,8 @@ pub struct FileListingUrl { glob: Option, } -impl FileListingUrl { - /// Parse a provided string as a [[FileListingUrl]] +impl ObjectListingUrl { + /// Parse a provided string as a [[ObjectListingUrl]] /// /// # Paths without a Scheme /// @@ -97,7 +97,7 @@ impl FileListingUrl { .map_err(|_| to_compute_err(format!("Can not open path: {url_path}"))) } - /// Creates a new [`FileListingUrl`] interpreting `s` as a filesystem path + /// Creates a new [`ObjectListingUrl`] interpreting `s` as a filesystem path fn parse_to_prefix(url_path: &str) -> PolarsResult<(String, Option)> { let (prefix, glob) = match split_glob_expression(url_path) { Some((prefix, glob)) => { @@ -194,7 +194,7 @@ impl FileListingUrl { .map_err(|e| PolarsError::Generic(Box::new(e))) } - /// Returns `true` if `path` matches this [`FileListingUrl`] + /// Returns `true` if `path` matches this [`ObjectListingUrl`] pub fn contains(&self, path: &String) -> bool { match self.strip_prefix(path) { Some(mut segments) => match &self.glob { @@ -208,7 +208,7 @@ impl FileListingUrl { } } - /// Strips the prefix of this [`FileListingUrl`] from the provided path, returning + /// Strips the prefix of this [`ObjectListingUrl`] from the provided path, returning /// an iterator of the remaining path segments pub(crate) fn strip_prefix<'a, 'b: 'a>( &'a self, @@ -217,7 +217,7 @@ impl FileListingUrl { Some(path.as_str().split_terminator(DELIMITER)) } - /// Streams all objects identified by this [`FileListingUrl`] for the provided options + /// Streams all objects identified by this [`ObjectListingUrl`] for the provided options pub async fn glob_object_stream<'a>( &'a self, store: &'a Operator, @@ -266,7 +266,7 @@ impl FileListingUrl { Ok(stream) } - /// Lists all objects identified by this [`FileListingUrl`] for the provided options + /// Lists all objects identified by this [`ObjectListingUrl`] for the provided options pub async fn glob_object_list<'a>( &'a self, store: &'a Operator, @@ -282,13 +282,13 @@ impl FileListingUrl { Ok(list) } - /// Returns this [`FileListingUrl`] as a string + /// Returns this [`ObjectListingUrl`] as a string pub fn as_str(&self) -> &str { self.url.as_str() } } -impl std::fmt::Display for FileListingUrl { +impl std::fmt::Display for ObjectListingUrl { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.as_str().fmt(f) } @@ -363,34 +363,34 @@ mod tests { let root = std::env::current_dir().unwrap(); let root = root.to_string_lossy(); - let url = FileListingUrl::parse(root).unwrap(); + let url = ObjectListingUrl::parse(root).unwrap(); let child = String::from("partition/file"); let prefix: Vec<_> = url.strip_prefix(&child).unwrap().collect(); assert_eq!(prefix, vec!["partition", "file"]); - let url = FileListingUrl::parse("file:///").unwrap(); + let url = ObjectListingUrl::parse("file:///").unwrap(); let child = String::from("foo/bar"); let prefix: Vec<_> = url.strip_prefix(&child).unwrap().collect(); assert_eq!(prefix, vec!["foo", "bar"]); - let url = FileListingUrl::parse("file:///foo").unwrap(); + let url = ObjectListingUrl::parse("file:///foo").unwrap(); let child = String::from("/foob/bar"); assert!(url.strip_prefix(&child).is_some()); - let url = FileListingUrl::parse("file:///foo/file").unwrap(); + let url = ObjectListingUrl::parse("file:///foo/file").unwrap(); let child = String::from("foo/file"); assert_eq!(url.strip_prefix(&child).unwrap().count(), 2); - let url = FileListingUrl::parse("file:///foo/ bar").unwrap(); + let url = ObjectListingUrl::parse("file:///foo/ bar").unwrap(); assert_eq!(url.prefix, "/foo/ bar"); - let url = FileListingUrl::parse("file:///foo/bar?").unwrap(); + let url = ObjectListingUrl::parse("file:///foo/bar?").unwrap(); assert_eq!(url.prefix, "/foo/bar"); } #[test] fn test_prefix_s3() { - let url = FileListingUrl::parse("s3://bucket/foo/bar/").unwrap(); + let url = ObjectListingUrl::parse("s3://bucket/foo/bar/").unwrap(); assert_eq!(url.prefix(), "/foo/bar/"); let child = String::from("partition/foo.parquet"); diff --git a/crates/polars-io/src/input/mod.rs b/crates/polars-io/src/input/mod.rs index bcbcd81eed5d..e7ba67283ac2 100644 --- a/crates/polars-io/src/input/mod.rs +++ b/crates/polars-io/src/input/mod.rs @@ -1,2 +1,41 @@ +use std::future::Future; + +use futures::StreamExt; +use once_cell::sync::Lazy; +use polars_error::{polars_err, PolarsResult}; +use tokio::runtime::{Handle, Runtime}; + pub mod file_format; pub mod file_listing; + +pub(crate) static RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .map_err(|e| polars_err!(ComputeError:"failed to create async runtime {}", e)) + .unwrap() +}); + +pub(crate) fn try_blocking_io(f: F) -> PolarsResult { + let _guard = RUNTIME.enter(); + let handle = Handle::try_current().map_err(|e| { + polars_err!(ComputeError: + "failed to create current async handle {}", e) + })?; + + Ok(handle.block_on(f)) +} + +pub fn try_map_async<'a, I, O, F>(input: Vec, buffer: usize, f: F) -> PolarsResult> +where + F: Fn(I) -> O, +{ + let iter = futures::stream::iter(input); + + try_blocking_io(async { + iter.map(|v| async { f(v) }) + .buffered(buffer) + .collect::>() + .await + }) +} diff --git a/crates/polars-lazy/src/frame/file_list_reader.rs b/crates/polars-lazy/src/frame/file_list_reader.rs index 8824406f2599..6e219f4969ce 100644 --- a/crates/polars-lazy/src/frame/file_list_reader.rs +++ b/crates/polars-lazy/src/frame/file_list_reader.rs @@ -3,11 +3,13 @@ use std::path::{Path, PathBuf}; use polars_core::cloud::CloudOptions; use polars_core::error::to_compute_err; use polars_core::prelude::*; +use polars_io::input::try_map_async; use polars_io::{is_cloud_url, RowCount}; use crate::prelude::*; pub type GlobIterator = Box>>; +pub type ObjectInfo = (String, Schema, (Option, usize)); // cloud_options is used only with async feature #[allow(unused_variables)] @@ -70,6 +72,39 @@ pub trait LazyFileListReader: Clone { } } + fn finish2(self) -> PolarsResult { + let file_infos = self.clone().glob_object_infos()?; + + let lfs = try_map_async(file_infos, 32, |file_info| { + // let x = &file_info.0.as_str(); + self.clone() + .object_to_lazy(file_info) + .expect(format!("error while reading {}", "fff").as_str()) + })?; + + polars_ensure!( + !lfs.is_empty(), + ComputeError: "no matching files found in {}", self.path().display() + ); + + if lfs.len() > 1 { + let mut lf = self.concat_impl(lfs)?; + + if let Some(n_rows) = self.n_rows() { + lf = lf.slice(0, n_rows as IdxSize) + }; + + if let Some(rc) = self.row_count() { + lf = lf.with_row_count(&rc.name, Some(rc.offset)) + }; + + Ok(lf) + } else { + // unwrap because we have checked the empty condition above + Ok(lfs.into_iter().next().unwrap()) + } + } + /// Recommended concatenation of [LazyFrame]s from many input files. /// /// This method should not take into consideration [LazyFileListReader::n_rows] @@ -84,6 +119,14 @@ pub trait LazyFileListReader: Clone { /// It is recommended to always use [LazyFileListReader::finish] method. fn finish_no_glob(self) -> PolarsResult; + fn glob_object_infos(self) -> PolarsResult> { + todo!() + } + + fn object_to_lazy(self, object_info: ObjectInfo) -> PolarsResult { + todo!() + } + /// Path of the scanned file. /// It can be potentially a glob pattern. fn path(&self) -> &Path; diff --git a/crates/polars-lazy/src/frame/parquet.rs b/crates/polars-lazy/src/frame/parquet.rs index c71ed3b7821d..43d97f5ba3b8 100644 --- a/crates/polars-lazy/src/frame/parquet.rs +++ b/crates/polars-lazy/src/frame/parquet.rs @@ -1,7 +1,11 @@ +use std::collections::HashMap; use std::path::{Path, PathBuf}; use polars_core::cloud::CloudOptions; use polars_core::prelude::*; +use polars_io::input::file_format::parquet::ParquetFormat; +use polars_io::input::file_format::FileFormat; +use polars_io::input::file_listing::ObjectListingUrl; use polars_io::parquet::ParallelStrategy; use polars_io::RowCount; @@ -36,13 +40,26 @@ impl Default for ScanArgsParquet { #[derive(Clone)] struct LazyParquetReader { - args: ScanArgsParquet, path: PathBuf, + args: ScanArgsParquet, + path_str: String, } impl LazyParquetReader { fn new(path: PathBuf, args: ScanArgsParquet) -> Self { - Self { args, path } + Self { + path, + args, + path_str: String::from(""), + } + } + + fn new2(path: String, args: ScanArgsParquet) -> Self { + Self { + path: PathBuf::from(path.clone()), + args, + path_str: path, + } } } @@ -73,6 +90,67 @@ impl LazyFileListReader for LazyParquetReader { Ok(lf) } + fn glob_object_infos(self) -> PolarsResult> { + let path_str = self.path_str.as_str(); + let url = ObjectListingUrl::parse(path_str)?; + + // todo! get this from cloud options + let cloud_opts = HashMap::new(); + ParquetFormat::create().glob_object_info(url, cloud_opts, true, false) + } + + fn object_to_lazy(self, file_info: ObjectInfo) -> PolarsResult { + let (path, schema, row_estimation) = file_info; + + let file_info = FileInfo { + schema: Arc::new(schema), + row_estimation, + }; + + let row_count = self.row_count().map(|x| x.to_owned()); + + // todo! check if this is still needed + // if let Some(rc) = self.row_count() { + // let _ = schema.insert_at_index(0, rc.name.as_str().into(), IDX_DTYPE); + // } + + let options = FileScanOptions { + with_columns: None, + cache: self.args.cache, + n_rows: self.args.n_rows, + rechunk: false, + row_count: row_count.clone(), + file_counter: Default::default(), + }; + + let lpb: LogicalPlanBuilder = LogicalPlan::Scan { + path: PathBuf::from(path), + file_info, + file_options: options, + predicate: None, + scan_type: FileScan::Parquet { + options: ParquetOptions { + parallel: self.args.parallel, + low_memory: self.args.low_memory, + use_statistics: self.args.use_statistics, + }, + cloud_options: self.args.cloud_options, + }, + } + .into(); + + let mut lf: LazyFrame = lpb.build().into(); + + // todo! should this be done post concat? + // it is a bit hacky, but this row_count function updates the schema + if let Some(rc) = &row_count { + lf = lf.with_row_count(&rc.name, Some(rc.offset)) + } + + lf.opt_state.file_caching = true; + Ok(lf) + } + fn path(&self) -> &Path { self.path.as_path() } @@ -91,10 +169,6 @@ impl LazyFileListReader for LazyParquetReader { self } - fn cloud_options(&self) -> Option<&CloudOptions> { - self.args.cloud_options.as_ref() - } - fn n_rows(&self) -> Option { self.args.n_rows } @@ -102,6 +176,10 @@ impl LazyFileListReader for LazyParquetReader { fn row_count(&self) -> Option<&RowCount> { self.args.row_count.as_ref() } + + fn cloud_options(&self) -> Option<&CloudOptions> { + self.args.cloud_options.as_ref() + } } impl LazyFrame { @@ -109,4 +187,44 @@ impl LazyFrame { pub fn scan_parquet(path: impl AsRef, args: ScanArgsParquet) -> PolarsResult { LazyParquetReader::new(path.as_ref().to_owned(), args).finish() } + + /// Create a LazyFrame directly from a parquet scan. + pub fn scan_parquet2(path: &str, args: ScanArgsParquet) -> PolarsResult { + LazyParquetReader::new2(path.to_string(), args).finish2() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_pq() { + let args = ScanArgsParquet { + n_rows: None, + cache: true, + row_count: Some(RowCount { + name: "serial_num".to_string(), + offset: 0, + }), + parallel: Default::default(), + ..Default::default() + }; + let x = LazyFrame::scan_parquet2( + "/Users/chitral/test_data/nw_app_activities.parquet", + args.clone(), + ) + .expect("cant make lazy frame"); + + let x2 = LazyFrame::scan_parquet( + "/Users/chitral/test_data/nw_app_activities.parquet", + args.clone(), + ) + .expect("cant make lazy frame"); + + dbg!(x.describe_optimized_plan()); + dbg!(x.schema()); + dbg!(x2.describe_optimized_plan()); + dbg!(x2.schema()); + } } From af8ddf4fc2956ee5a81b9c37a0f9005a7ea75446 Mon Sep 17 00:00:00 2001 From: Chitral Verma Date: Sat, 23 Sep 2023 01:15:45 +0530 Subject: [PATCH 4/7] remove unnecessary features for now Signed-off-by: Chitral Verma --- crates/polars-io/Cargo.toml | 5 +- py-polars/Cargo.lock | 667 +++++++++++++++++++++++++++++++++--- 2 files changed, 623 insertions(+), 49 deletions(-) diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index 4ad62a139d36..dc1f76fec667 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -77,14 +77,13 @@ json = [ "serde_json", "dtype-struct", "csv", - "arrow/io_json_read", ] # support for arrows ipc file parsing -ipc = ["arrow/io_ipc", "arrow/io_ipc_compression", "arrow/io_ipc_read_async"] +ipc = ["arrow/io_ipc", "arrow/io_ipc_compression"] # support for arrows streaming ipc file parsing ipc_streaming = ["arrow/io_ipc", "arrow/io_ipc_compression"] # support for arrow avro parsing -avro = ["arrow/io_avro", "arrow/io_avro_compression", "arrow/io_avro_async"] +avro = ["arrow/io_avro", "arrow/io_avro_compression"] csv = ["lexical", "polars-core/rows", "itoa", "ryu", "fast-float", "simdutf8"] decompress = ["flate2/rust_backend"] decompress-fast = ["flate2/zlib-ng"] diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index a28528d84ebf..9c0213b6e353 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -37,9 +37,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.0.5" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783" +checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab" dependencies = [ "memchr", ] @@ -80,6 +80,12 @@ dependencies = [ "libc", ] +[[package]] +name = "anyhow" +version = "1.0.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" + [[package]] name = "argminmax" version = "0.6.1" @@ -105,6 +111,19 @@ dependencies = [ "serde", ] +[[package]] +name = "async-compat" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b48b4ff0c2026db683dea961cd8ea874737f56cffca86fa84415eaddc51c00d" +dependencies = [ + "futures-core", + "futures-io", + "once_cell", + "pin-project-lite", + "tokio", +] + [[package]] name = "async-stream" version = "0.3.5" @@ -124,7 +143,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.36", + "syn 2.0.37", ] [[package]] @@ -135,7 +154,7 @@ checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.36", + "syn 2.0.37", ] [[package]] @@ -167,6 +186,18 @@ dependencies = [ "snap", ] +[[package]] +name = "backon" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c1a6197b2120bb2185a267f6515038558b019e92b832bb0320e96d66268dcf9" +dependencies = [ + "fastrand", + "futures-core", + "pin-project", + "tokio", +] + [[package]] name = "backtrace" version = "0.3.69" @@ -182,12 +213,24 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ba43ea6f343b788c8764558649e08df62f86c6ef251fdaeb1ffd010a9ae50a2" +[[package]] +name = "base64ct" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" + [[package]] name = "bitflags" version = "1.3.2" @@ -203,6 +246,15 @@ dependencies = [ "serde", ] +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "brotli" version = "3.3.4" @@ -258,9 +310,15 @@ checksum = "965ab7eb5f8f97d2a083c799f3a1b994fc397b2fe2da5d1da1626ce15a39f2b1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.36", + "syn 2.0.37", ] +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + [[package]] name = "bytes" version = "1.5.0" @@ -380,12 +438,49 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "const-oid" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f" + +[[package]] +name = "const-random" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368a7a772ead6ce7e1de82bfb04c485f3db8ec744f72925af5735e29a22cc18e" +dependencies = [ + "const-random-macro", + "proc-macro-hack", +] + +[[package]] +name = "const-random-macro" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d7d6ab3c3a2282db210df5f02c4dab6e0a7057af0fb7ebd4070f30fe05c0ddb" +dependencies = [ + "getrandom", + "once_cell", + "proc-macro-hack", + "tiny-keccak", +] + [[package]] name = "core-foundation-sys" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +[[package]] +name = "cpufeatures" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" +dependencies = [ + "libc", +] + [[package]] name = "crc" version = "2.1.0" @@ -488,6 +583,60 @@ dependencies = [ "winapi", ] +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "der" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" +dependencies = [ + "const-oid", + "pem-rfc7468", + "zeroize", +] + +[[package]] +name = "deranged" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2696e8a945f658fd14dc3b87242e6b80cd0f36ff04ea560fa39082368847946" + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "const-oid", + "crypto-common", + "subtle", +] + +[[package]] +name = "dlv-list" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8aead04dc46b5f263c25721cf25c9e595951d15055f8063f92392fa0d7f64cf4" +dependencies = [ + "const-random", +] + [[package]] name = "doc-comment" version = "0.3.3" @@ -496,9 +645,9 @@ checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" [[package]] name = "dyn-clone" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbfc4744c1b8f2a09adc0e55242f60b1af195d88596bd8700be74418c056c555" +checksum = "23d2f3407d9a573d666de4b5bdf10569d73ca9478087346697dcbae6244bfbcd" [[package]] name = "either" @@ -524,7 +673,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.36", + "syn 2.0.37", ] [[package]] @@ -551,6 +700,21 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95765f67b4b18863968b4a1bd5bb576f732b29a4a28c7cd84c09fa3e2875f33c" +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + +[[package]] +name = "flagset" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a7e408202050813e6f1d9addadcaafef3dca7530c7ddfb005d4081cce6779" + [[package]] name = "flate2" version = "1.0.27" @@ -648,7 +812,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.36", + "syn 2.0.37", ] [[package]] @@ -681,6 +845,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.10" @@ -788,9 +962,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" +checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" [[package]] name = "hex" @@ -798,6 +972,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + [[package]] name = "home" version = "0.5.5" @@ -945,6 +1128,15 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa799dd5ed20a7e349f3b4639aa80d74549c81716d9ec4f994c9b5815598306" +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + [[package]] name = "inventory" version = "0.3.12" @@ -966,6 +1158,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.9" @@ -1026,6 +1227,29 @@ dependencies = [ "serde_json", ] +[[package]] +name = "jsonwebtoken" +version = "8.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378" +dependencies = [ + "base64 0.21.4", + "pem", + "ring", + "serde", + "serde_json", + "simple_asn1", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +dependencies = [ + "spin", +] + [[package]] name = "lexical" version = "6.1.1" @@ -1223,14 +1447,24 @@ dependencies = [ [[package]] name = "matrixmultiply" -version = "0.3.7" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "090126dc04f95dc0d1c1c91f61bdd474b3930ca064c1edc8a849da2c6cbe1e77" +checksum = "7574c1cf36da4798ab73da5b215bbf444f50718207754cb522201d78d1cd0ff2" dependencies = [ "autocfg", "rawpointer", ] +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + [[package]] name = "memchr" version = "2.6.3" @@ -1320,7 +1554,7 @@ dependencies = [ "ahash", "arrow-format", "avro-schema", - "base64", + "base64 0.21.4", "bytemuck", "chrono", "chrono-tz", @@ -1375,6 +1609,34 @@ dependencies = [ "winapi", ] +[[package]] +name = "num-bigint" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "608e7659b5c3d7cba262d894801b9ec9d00de989e8a82bd4bef91d08da45cdc0" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-bigint-dig" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +dependencies = [ + "byteorder", + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand", + "smallvec", + "zeroize", +] + [[package]] name = "num-complex" version = "0.4.4" @@ -1394,6 +1656,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.16" @@ -1445,16 +1718,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d359e231e5451f4f9fa889d56e3ce34f8724f1a61db2107739359717cf2bbf08" dependencies = [ "async-trait", - "base64", + "base64 0.21.4", "bytes", "chrono", "futures", "humantime", "hyper", - "itertools", + "itertools 0.10.5", "parking_lot", "percent-encoding", - "quick-xml", + "quick-xml 0.28.2", "rand", "reqwest", "ring", @@ -1474,6 +1747,49 @@ version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +[[package]] +name = "opendal" +version = "0.40.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddba7299bab261d3ae2f37617fb7f45b19ed872752bb4e22cf93a69d979366c5" +dependencies = [ + "anyhow", + "async-compat", + "async-trait", + "backon", + "base64 0.21.4", + "bytes", + "chrono", + "flagset", + "futures", + "http", + "hyper", + "log", + "md-5", + "once_cell", + "parking_lot", + "percent-encoding", + "pin-project", + "quick-xml 0.29.0", + "reqsign", + "reqwest", + "serde", + "serde_json", + "sha2", + "tokio", + "uuid", +] + +[[package]] +name = "ordered-multimap" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ed8acf08e98e744e5384c8bc63ceb0364e68a6854187221c18df61c4797690e" +dependencies = [ + "dlv-list", + "hashbrown 0.13.2", +] + [[package]] name = "parking_lot" version = "0.12.1" @@ -1534,6 +1850,24 @@ dependencies = [ "regex", ] +[[package]] +name = "pem" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8835c273a76a90455d7344889b0964598e3316e2a79ede8e36f16bdcf2228b8" +dependencies = [ + "base64 0.13.1", +] + +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + [[package]] name = "percent-encoding" version = "2.3.0" @@ -1578,6 +1912,26 @@ dependencies = [ "siphasher", ] +[[package]] +name = "pin-project" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fda4ed1c6c173e3fc7a83629421152e01d7b1f9b7f65fb301e490e8cfc656422" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.37", +] + [[package]] name = "pin-project-lite" version = "0.2.13" @@ -1590,6 +1944,27 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der", + "pkcs8", + "spki", +] + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der", + "spki", +] + [[package]] name = "pkg-config" version = "0.3.27" @@ -1684,7 +2059,6 @@ name = "polars-error" version = "0.33.2" dependencies = [ "nano-arrow", - "object_store", "regex", "thiserror", ] @@ -1709,7 +2083,9 @@ dependencies = [ "fast-float", "flate2", "futures", + "glob", "home", + "itertools 0.11.0", "itoa", "lexical", "lexical-core", @@ -1719,6 +2095,8 @@ dependencies = [ "num-traits", "object_store", "once_cell", + "opendal", + "percent-encoding", "polars-arrow", "polars-core", "polars-error", @@ -1786,7 +2164,7 @@ version = "0.33.2" dependencies = [ "ahash", "argminmax", - "base64", + "base64 0.21.4", "chrono", "chrono-tz", "either", @@ -1921,6 +2299,12 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "proc-macro-hack" +version = "0.5.20+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" + [[package]] name = "proc-macro2" version = "1.0.67" @@ -2036,6 +2420,16 @@ dependencies = [ "serde", ] +[[package]] +name = "quick-xml" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81b9228215d82c7b61490fec1de287136b5de6f5700f6e58ea9ad61a7964ca51" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quote" version = "1.0.33" @@ -2093,9 +2487,9 @@ checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" [[package]] name = "rayon" -version = "1.7.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" +checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1" dependencies = [ "either", "rayon-core", @@ -2103,14 +2497,12 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" +checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed" dependencies = [ - "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", - "num_cpus", ] [[package]] @@ -2151,13 +2543,44 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +[[package]] +name = "reqsign" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3228e570df74d69d3d3236a71371f1edd748a3e4eb728ea1f29d403bc10fc727" +dependencies = [ + "anyhow", + "async-trait", + "base64 0.21.4", + "chrono", + "form_urlencoded", + "hex", + "hmac", + "home", + "http", + "jsonwebtoken", + "log", + "once_cell", + "percent-encoding", + "quick-xml 0.29.0", + "rand", + "reqwest", + "rsa", + "rust-ini", + "serde", + "serde_json", + "sha1", + "sha2", + "tokio", +] + [[package]] name = "reqwest" version = "0.11.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e9ad3fe7488d7e34558a2033d45a0c90b72d97b4f80705666fea71472e2e6a1" dependencies = [ - "base64", + "base64 0.21.4", "bytes", "encoding_rs", "futures-core", @@ -2213,6 +2636,38 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" +[[package]] +name = "rsa" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ab43bb47d23c1a631b4b680199a45255dce26fa9ab2fa902581f624ff13e6a8" +dependencies = [ + "byteorder", + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-iter", + "num-traits", + "pkcs1", + "pkcs8", + "rand_core", + "signature", + "spki", + "subtle", + "zeroize", +] + +[[package]] +name = "rust-ini" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e2a3bcec1f113553ef1c88aae6c020a369d03d55b58de9869a0908930385091" +dependencies = [ + "cfg-if", + "ordered-multimap", +] + [[package]] name = "rustc-demangle" version = "0.1.23" @@ -2252,14 +2707,14 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d3987094b1d07b653b7dfdc3f70ce9a1da9c51ac18c1b06b662e4f9a0e9f4b2" dependencies = [ - "base64", + "base64 0.21.4", ] [[package]] name = "rustls-webpki" -version = "0.101.5" +version = "0.101.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45a27e3b59326c16e23d30aeb7a36a24cc0d29e71d68ff611cdfb4a01d013bed" +checksum = "3c7d5dece342910d9ba34d259310cae3e0154b873b35408b787b59bce53d34fe" dependencies = [ "ring", "untrusted", @@ -2334,7 +2789,7 @@ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.36", + "syn 2.0.37", ] [[package]] @@ -2370,6 +2825,28 @@ dependencies = [ "serde", ] +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "signal-hook" version = "0.3.17" @@ -2400,6 +2877,16 @@ dependencies = [ "libc", ] +[[package]] +name = "signature" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e1788eed21689f9cf370582dfc467ef36ed9c707f073528ddafa8d83e3b8500" +dependencies = [ + "digest", + "rand_core", +] + [[package]] name = "simd-json" version = "0.10.7" @@ -2423,6 +2910,18 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" +[[package]] +name = "simple_asn1" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085" +dependencies = [ + "num-bigint", + "num-traits", + "thiserror", + "time", +] + [[package]] name = "siphasher" version = "0.3.11" @@ -2440,9 +2939,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" [[package]] name = "smartstring" @@ -2510,6 +3009,16 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" +[[package]] +name = "spki" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d1e996ef02c474957d681f1b05213dfb0abab947b446a62d37770b23500184a" +dependencies = [ + "base64ct", + "der", +] + [[package]] name = "sqlparser" version = "0.36.1" @@ -2575,9 +3084,15 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.36", + "syn 2.0.37", ] +[[package]] +name = "subtle" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" + [[package]] name = "syn" version = "1.0.109" @@ -2591,9 +3106,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.36" +version = "2.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e02e55d62894af2a08aca894c6577281f76769ba47c94d5756bec8ac6e7373" +checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8" dependencies = [ "proc-macro2", "quote", @@ -2643,7 +3158,44 @@ checksum = "49922ecae66cc8a249b77e68d1d0623c1b2c514f0060c27cdc68bd62a1219d35" dependencies = [ "proc-macro2", "quote", - "syn 2.0.36", + "syn 2.0.37", +] + +[[package]] +name = "time" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17f6bb557fd245c28e6411aa56b6403c689ad95061f50e4be16c274e70a17e48" +dependencies = [ + "deranged", + "itoa", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" + +[[package]] +name = "time-macros" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a942f44339478ef67935ab2bbaec2fb0322496cf3cbe84b261e06ac3814c572" +dependencies = [ + "time-core", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", ] [[package]] @@ -2671,6 +3223,7 @@ dependencies = [ "bytes", "libc", "mio", + "num_cpus", "pin-project-lite", "socket2 0.5.4", "tokio-macros", @@ -2685,7 +3238,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.36", + "syn 2.0.37", ] [[package]] @@ -2700,9 +3253,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.8" +version = "0.7.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" +checksum = "1d68074620f57a0b21594d9735eb2e98ab38b17f80d3fcb189fca266771ca60d" dependencies = [ "bytes", "futures-core", @@ -2772,7 +3325,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.36", + "syn 2.0.37", ] [[package]] @@ -2790,6 +3343,12 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + [[package]] name = "unicode-bidi" version = "0.3.13" @@ -2813,9 +3372,9 @@ dependencies = [ [[package]] name = "unicode-width" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" +checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" [[package]] name = "unindent" @@ -2840,6 +3399,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "uuid" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79daa5ed5740825c40b389c5e50312b9c86df53fccd33f281df655642b43869d" +dependencies = [ + "getrandom", + "serde", +] + [[package]] name = "value-trait" version = "0.6.1" @@ -2910,7 +3479,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.36", + "syn 2.0.37", "wasm-bindgen-shared", ] @@ -2944,7 +3513,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.36", + "syn 2.0.37", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3002,9 +3571,9 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" dependencies = [ "winapi", ] @@ -3115,6 +3684,12 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9828b178da53440fa9c766a3d2f73f7cf5d0ac1fe3980c1e5018d899fd19e07b" +[[package]] +name = "zeroize" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" + [[package]] name = "zstd" version = "0.12.4" From e39e45caf8e3e893f9cc9473e17f12155a0990da Mon Sep 17 00:00:00 2001 From: Chitral Verma Date: Sat, 23 Sep 2023 01:31:03 +0530 Subject: [PATCH 5/7] lint fix Signed-off-by: Chitral Verma --- crates/polars-io/Cargo.toml | 22 ++++++++--------- crates/polars-io/src/input/file_format/mod.rs | 24 +++++-------------- .../src/input/file_format/parquet.rs | 7 ++---- crates/polars-io/src/input/file_listing.rs | 8 +++---- crates/polars-io/src/input/mod.rs | 2 +- .../polars-lazy/src/frame/file_list_reader.rs | 5 ++-- crates/polars-lazy/src/frame/parquet.rs | 1 + py-polars/Cargo.lock | 1 + 8 files changed, 28 insertions(+), 42 deletions(-) diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index dc1f76fec667..37be278118d2 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -25,6 +25,8 @@ chrono-tz = { workspace = true, optional = true } fast-float = { version = "0.2", optional = true } flate2 = { version = "1", optional = true, default-features = false } futures = { workspace = true } +glob = { version = "0.3" } +itertools = { version = "0" } itoa = { workspace = true, optional = true } lexical = { version = "6", optional = true, default-features = false, features = ["std", "parse-integers"] } lexical-core = { workspace = true, optional = true } @@ -33,6 +35,7 @@ memmap = { package = "memmap2", version = "0.7" } num-traits = { workspace = true } object_store = { workspace = true, optional = true } once_cell = { workspace = true } +percent-encoding = { version = "2.3" } rayon = { workspace = true } regex = { workspace = true } ryu = { workspace = true, optional = true } @@ -43,21 +46,18 @@ simdutf8 = { workspace = true, optional = true } tokio = { version = "1.26", features = ["net", "io-util", "rt-multi-thread"] } tokio-util = { version = "0.7.8", features = ["io", "io-util"] } url = { workspace = true } -glob = { version = "0.3" } -percent-encoding = { version = "2.3" } -itertools = { version = "0" } [dependencies.opendal] version = "0.40" default-features = false features = [ - "services-azblob", - "services-azdls", - "services-gcs", - "services-s3", - "services-http", - "services-fs", - "services-webhdfs", + "services-azblob", + "services-azdls", + "services-gcs", + "services-s3", + "services-http", + "services-fs", + "services-webhdfs", ] [target.'cfg(not(target_family = "wasm"))'.dependencies] @@ -107,7 +107,7 @@ fmt = ["polars-core/fmt"] lazy = [] parquet = ["polars-core/parquet", "arrow/io_parquet", "arrow/io_parquet_compression"] async = ["arrow/io_ipc_write_async", "polars-error/regex"] -cloud = ["object_store", "async"] +cloud = ["object_store", "async", "polars-error/object_store"] aws = ["object_store/aws", "cloud"] azure = ["object_store/azure", "cloud"] gcp = ["object_store/gcp", "cloud"] diff --git a/crates/polars-io/src/input/file_format/mod.rs b/crates/polars-io/src/input/file_format/mod.rs index 6dde7351103a..6d5527b10fad 100644 --- a/crates/polars-io/src/input/file_format/mod.rs +++ b/crates/polars-io/src/input/file_format/mod.rs @@ -10,17 +10,11 @@ use polars_error::PolarsResult; use crate::input::file_listing::ObjectListingUrl; use crate::input::try_blocking_io; -#[cfg(feature = "avro")] -pub mod avro; -#[cfg(feature = "csv")] -pub mod csv; -#[cfg(any(feature = "ipc", feature = "ipc_streaming"))] -pub mod ipc; -#[cfg(feature = "json")] -pub mod ndjson; #[cfg(feature = "parquet")] pub mod parquet; +pub type ObjectInfo = (String, Schema, (Option, usize)); + pub trait FileFormatOptions {} /// Default max records to scan to infer the schema @@ -56,7 +50,7 @@ pub trait FileFormat: std::fmt::Display + Send + Sync + Debug + 'static { cloud_opts: HashMap, exclude_empty: bool, recursive: bool, - ) -> PolarsResult, usize))>> { + ) -> PolarsResult> { try_blocking_io(async { let url = listing_url.clone(); let operator = url @@ -68,14 +62,12 @@ pub trait FileFormat: std::fmt::Display + Send + Sync + Debug + 'static { .await .expect("failed to glob objects from remote store"); - let object_infos = futures::stream::iter(objects) + futures::stream::iter(objects) .map(|(path, _)| async { self.get_object_info(&operator, path).await }) .buffer_unordered(SCHEMA_INFERENCE_CONCURRENCY) .try_collect::>() .await - .expect("failed to get info for one or more objects"); - - object_infos + .expect("failed to get info for one or more objects") }) } @@ -83,9 +75,5 @@ pub trait FileFormat: std::fmt::Display + Send + Sync + Debug + 'static { /// object info (path, schema, size_hint). /// /// The [Schema] is inferred from the format specific metadata. - async fn get_object_info( - &self, - operator: &Operator, - path: String, - ) -> PolarsResult<(String, Schema, (Option, usize))>; + async fn get_object_info(&self, operator: &Operator, path: String) -> PolarsResult; } diff --git a/crates/polars-io/src/input/file_format/parquet.rs b/crates/polars-io/src/input/file_format/parquet.rs index e6434a891f9e..81d2ea40c693 100644 --- a/crates/polars-io/src/input/file_format/parquet.rs +++ b/crates/polars-io/src/input/file_format/parquet.rs @@ -7,6 +7,7 @@ use opendal::Operator; use polars_core::schema::Schema; use polars_error::{to_compute_err, PolarsResult}; +use crate::file_format::ObjectInfo; use crate::input::file_format::FileFormat; #[derive(Debug)] @@ -62,11 +63,7 @@ impl FileFormat for ParquetFormat { (known, estimated) } - async fn get_object_info( - &self, - operator: &Operator, - path: String, - ) -> PolarsResult<(String, Schema, (Option, usize))> { + async fn get_object_info(&self, operator: &Operator, path: String) -> PolarsResult { let (metadata, _) = self.fetch_metadata_async(operator, path.clone()).await?; let num_rows = &metadata.num_rows; let size_hint = self.calculate_rows_count((*num_rows, Some(*num_rows))); diff --git a/crates/polars-io/src/input/file_listing.rs b/crates/polars-io/src/input/file_listing.rs index a70d2a95d442..2a5d308051df 100644 --- a/crates/polars-io/src/input/file_listing.rs +++ b/crates/polars-io/src/input/file_listing.rs @@ -190,12 +190,12 @@ impl ObjectListingUrl { _opts.insert("root".to_string(), root); Operator::via_map(scheme, _opts) - .and_then(|op| Ok(op.layer(RetryLayer::new()))) + .map(|op| op.layer(RetryLayer::new())) .map_err(|e| PolarsError::Generic(Box::new(e))) } /// Returns `true` if `path` matches this [`ObjectListingUrl`] - pub fn contains(&self, path: &String) -> bool { + pub fn contains(&self, path: &str) -> bool { match self.strip_prefix(path) { Some(mut segments) => match &self.glob { Some(glob) => { @@ -212,9 +212,9 @@ impl ObjectListingUrl { /// an iterator of the remaining path segments pub(crate) fn strip_prefix<'a, 'b: 'a>( &'a self, - path: &'b String, + path: &'b str, ) -> Option + 'a> { - Some(path.as_str().split_terminator(DELIMITER)) + Some(path.split_terminator(DELIMITER)) } /// Streams all objects identified by this [`ObjectListingUrl`] for the provided options diff --git a/crates/polars-io/src/input/mod.rs b/crates/polars-io/src/input/mod.rs index e7ba67283ac2..0284d58fe984 100644 --- a/crates/polars-io/src/input/mod.rs +++ b/crates/polars-io/src/input/mod.rs @@ -26,7 +26,7 @@ pub(crate) fn try_blocking_io(f: F) -> PolarsResult { Ok(handle.block_on(f)) } -pub fn try_map_async<'a, I, O, F>(input: Vec, buffer: usize, f: F) -> PolarsResult> +pub fn try_map_async(input: Vec, buffer: usize, f: F) -> PolarsResult> where F: Fn(I) -> O, { diff --git a/crates/polars-lazy/src/frame/file_list_reader.rs b/crates/polars-lazy/src/frame/file_list_reader.rs index 85e263659173..5da3afdf6a2d 100644 --- a/crates/polars-lazy/src/frame/file_list_reader.rs +++ b/crates/polars-lazy/src/frame/file_list_reader.rs @@ -3,13 +3,12 @@ use std::path::{Path, PathBuf}; use polars_core::error::to_compute_err; use polars_core::prelude::*; use polars_io::cloud::CloudOptions; +use polars_io::file_format::ObjectInfo; use polars_io::input::try_map_async; use polars_io::{is_cloud_url, RowCount}; use crate::prelude::*; - pub type GlobIterator = Box>>; -pub type ObjectInfo = (String, Schema, (Option, usize)); // cloud_options is used only with async feature #[allow(unused_variables)] @@ -123,7 +122,7 @@ pub trait LazyFileListReader: Clone { todo!() } - fn object_to_lazy(self, object_info: ObjectInfo) -> PolarsResult { + fn object_to_lazy(self, _object_info: ObjectInfo) -> PolarsResult { todo!() } diff --git a/crates/polars-lazy/src/frame/parquet.rs b/crates/polars-lazy/src/frame/parquet.rs index 2978324e776d..834ed98720e9 100644 --- a/crates/polars-lazy/src/frame/parquet.rs +++ b/crates/polars-lazy/src/frame/parquet.rs @@ -3,6 +3,7 @@ use std::path::{Path, PathBuf}; use polars_core::prelude::*; use polars_io::cloud::CloudOptions; +use polars_io::file_format::ObjectInfo; use polars_io::input::file_format::parquet::ParquetFormat; use polars_io::input::file_format::FileFormat; use polars_io::input::file_listing::ObjectListingUrl; diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 9c0213b6e353..016e521c99fb 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -2059,6 +2059,7 @@ name = "polars-error" version = "0.33.2" dependencies = [ "nano-arrow", + "object_store", "regex", "thiserror", ] From b709733136e32f777b2dae3100028be53a8732fa Mon Sep 17 00:00:00 2001 From: Chitral Verma Date: Sat, 23 Sep 2023 01:43:06 +0530 Subject: [PATCH 6/7] Add remaining eager side abstractions Signed-off-by: Chitral Verma --- crates/polars-io/src/input/file_format/mod.rs | 24 ++++++++++++++----- crates/polars-io/src/input/file_listing.rs | 11 ++++++--- crates/polars-lazy/src/frame/parquet.rs | 3 +-- py-polars/src/error.rs | 2 ++ 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/crates/polars-io/src/input/file_format/mod.rs b/crates/polars-io/src/input/file_format/mod.rs index 6d5527b10fad..1e6bb7ee8d2b 100644 --- a/crates/polars-io/src/input/file_format/mod.rs +++ b/crates/polars-io/src/input/file_format/mod.rs @@ -1,14 +1,15 @@ -use std::collections::HashMap; use std::fmt::Debug; +use std::sync::Arc; use async_trait::async_trait; use futures::{StreamExt, TryStreamExt}; use opendal::Operator; -use polars_core::prelude::Schema; +use polars_core::prelude::{DataFrame, PlHashMap, Schema}; use polars_error::PolarsResult; use crate::input::file_listing::ObjectListingUrl; use crate::input::try_blocking_io; +use crate::predicates::PhysicalIoExpr; #[cfg(feature = "parquet")] pub mod parquet; @@ -17,9 +18,6 @@ pub type ObjectInfo = (String, Schema, (Option, usize)); pub trait FileFormatOptions {} -/// Default max records to scan to infer the schema -const DEFAULT_SCHEMA_INFER_MAX_RECORD: usize = 1000; - /// The number of objects to read in parallel when inferring schema const SCHEMA_INFERENCE_CONCURRENCY: usize = 32; @@ -47,7 +45,7 @@ pub trait FileFormat: std::fmt::Display + Send + Sync + Debug + 'static { fn glob_object_info( &self, listing_url: ObjectListingUrl, - cloud_opts: HashMap, + cloud_opts: PlHashMap, exclude_empty: bool, recursive: bool, ) -> PolarsResult> { @@ -76,4 +74,18 @@ pub trait FileFormat: std::fmt::Display + Send + Sync + Debug + 'static { /// /// The [Schema] is inferred from the format specific metadata. async fn get_object_info(&self, operator: &Operator, path: String) -> PolarsResult; + + fn finish_read( + &self, + _n_rows: Option, + _columns: Option>, + _predicate: Option>, + _projection: Option>, + ) -> PolarsResult { + todo!() + } + + fn get_batches(&self) -> PolarsResult> { + todo!() + } } diff --git a/crates/polars-io/src/input/file_listing.rs b/crates/polars-io/src/input/file_listing.rs index 2a5d308051df..26ecc3b9959c 100644 --- a/crates/polars-io/src/input/file_listing.rs +++ b/crates/polars-io/src/input/file_listing.rs @@ -1,6 +1,5 @@ //! This module is inspired from [arrow-datafusion](https://github.com/apache/arrow-datafusion/blob/f4c4ee1e7ffa97b089994162c3d754402f218503/datafusion/core/src/datasource/listing/url.rs) but decoupled from object_store crate in favour of opendal crate. -use std::collections::HashMap; use std::path::{Component, PathBuf}; use futures::stream::BoxStream; @@ -12,6 +11,7 @@ use opendal::{ Error as OpenDalError, ErrorKind as OpenDalErrorKind, Metadata, Metakey, Operator, Scheme, }; use percent_encoding; +use polars_core::prelude::PlHashMap; use polars_error::{to_compute_err, PolarsError, PolarsResult}; use url::Url; @@ -149,8 +149,9 @@ impl ObjectListingUrl { self.prefix.ends_with(DELIMITER) } - pub fn infer_operator(&self, opts: HashMap) -> PolarsResult { - let mut _opts = opts; + #[allow(clippy::disallowed_types)] + pub fn infer_operator(&self, opts: PlHashMap) -> PolarsResult { + let mut _opts = std::collections::HashMap::new(); let scheme = match (self.url.scheme(), self.url.host_str()) { ("file", None) => Scheme::Fs, @@ -189,6 +190,10 @@ impl ObjectListingUrl { _opts.insert("root".to_string(), root); + for (key, value) in opts { + _opts.insert(key, value); + } + Operator::via_map(scheme, _opts) .map(|op| op.layer(RetryLayer::new())) .map_err(|e| PolarsError::Generic(Box::new(e))) diff --git a/crates/polars-lazy/src/frame/parquet.rs b/crates/polars-lazy/src/frame/parquet.rs index 834ed98720e9..34514c281872 100644 --- a/crates/polars-lazy/src/frame/parquet.rs +++ b/crates/polars-lazy/src/frame/parquet.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::path::{Path, PathBuf}; use polars_core::prelude::*; @@ -96,7 +95,7 @@ impl LazyFileListReader for LazyParquetReader { let url = ObjectListingUrl::parse(path_str)?; // todo! get this from cloud options - let cloud_opts = HashMap::new(); + let cloud_opts = PlHashMap::new(); ParquetFormat::create().glob_object_info(url, cloud_opts, true, false) } diff --git a/py-polars/src/error.rs b/py-polars/src/error.rs index 6e017bf566b7..9e521e0cf105 100644 --- a/py-polars/src/error.rs +++ b/py-polars/src/error.rs @@ -52,6 +52,7 @@ impl std::convert::From for PyErr { PolarsError::StructFieldNotFound(name) => { StructFieldNotFoundError::new_err(name.to_string()) }, + PolarsError::Generic(err) => GenericError::new_err(format!("{err:?}")), }, Arrow(err) => ArrowErrorException::new_err(format!("{err:?}")), _ => default(), @@ -82,6 +83,7 @@ create_exception!(exceptions, SchemaFieldNotFoundError, PyException); create_exception!(exceptions, ShapeError, PyException); create_exception!(exceptions, StringCacheMismatchError, PyException); create_exception!(exceptions, StructFieldNotFoundError, PyException); +create_exception!(exceptions, GenericError, PyException); #[macro_export] macro_rules! raise_err( From 968056aff9a436c70cde3a45a4f22d8d6a1b1e6d Mon Sep 17 00:00:00 2001 From: Chitral Verma Date: Sat, 23 Sep 2023 02:41:03 +0530 Subject: [PATCH 7/7] lint fix Signed-off-by: Chitral Verma --- crates/polars-io/src/input/file_listing.rs | 22 ++++++++++++++----- .../polars-lazy/src/frame/file_list_reader.rs | 6 ++--- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/crates/polars-io/src/input/file_listing.rs b/crates/polars-io/src/input/file_listing.rs index 26ecc3b9959c..2075b0a76025 100644 --- a/crates/polars-io/src/input/file_listing.rs +++ b/crates/polars-io/src/input/file_listing.rs @@ -1,6 +1,6 @@ //! This module is inspired from [arrow-datafusion](https://github.com/apache/arrow-datafusion/blob/f4c4ee1e7ffa97b089994162c3d754402f218503/datafusion/core/src/datasource/listing/url.rs) but decoupled from object_store crate in favour of opendal crate. -use std::path::{Component, PathBuf}; +use std::path::{Component, PathBuf, MAIN_SEPARATOR_STR}; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; @@ -15,7 +15,7 @@ use polars_core::prelude::PlHashMap; use polars_error::{to_compute_err, PolarsError, PolarsResult}; use url::Url; -pub(crate) const DELIMITER: &str = "/"; +pub(crate) const DELIMITER: &str = MAIN_SEPARATOR_STR; /// A parsed URL identifying files for a listing table, see [`ObjectListingUrl::parse`] /// for more information on the supported expressions @@ -387,16 +387,28 @@ mod tests { assert_eq!(url.strip_prefix(&child).unwrap().count(), 2); let url = ObjectListingUrl::parse("file:///foo/ bar").unwrap(); - assert_eq!(url.prefix, "/foo/ bar"); + assert_eq!( + url.prefix, + format!("{}foo{} bar", MAIN_SEPARATOR_STR, MAIN_SEPARATOR_STR) + ); let url = ObjectListingUrl::parse("file:///foo/bar?").unwrap(); - assert_eq!(url.prefix, "/foo/bar"); + assert_eq!( + url.prefix, + format!("{}foo{}bar", MAIN_SEPARATOR_STR, MAIN_SEPARATOR_STR) + ); } #[test] fn test_prefix_s3() { let url = ObjectListingUrl::parse("s3://bucket/foo/bar/").unwrap(); - assert_eq!(url.prefix(), "/foo/bar/"); + assert_eq!( + url.prefix(), + format!( + "{}foo{}bar{}", + MAIN_SEPARATOR_STR, MAIN_SEPARATOR_STR, MAIN_SEPARATOR_STR + ) + ); let child = String::from("partition/foo.parquet"); let prefix: Vec<_> = url.strip_prefix(&child).unwrap().collect(); diff --git a/crates/polars-lazy/src/frame/file_list_reader.rs b/crates/polars-lazy/src/frame/file_list_reader.rs index 5da3afdf6a2d..d09e6e5713ae 100644 --- a/crates/polars-lazy/src/frame/file_list_reader.rs +++ b/crates/polars-lazy/src/frame/file_list_reader.rs @@ -75,10 +75,10 @@ pub trait LazyFileListReader: Clone { let file_infos = self.clone().glob_object_infos()?; let lfs = try_map_async(file_infos, 32, |file_info| { - // let x = &file_info.0.as_str(); + let (path, ..) = &file_info; self.clone() - .object_to_lazy(file_info) - .expect(format!("error while reading {}", "fff").as_str()) + .object_to_lazy(file_info.clone()) + .unwrap_or_else(|_| panic!("error while reading {}", &path)) })?; polars_ensure!(