diff --git a/Cargo.lock b/Cargo.lock index 37c39134..8dbf614d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -63,6 +63,20 @@ dependencies = [ "version_check", ] +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "const-random", + "getrandom 0.2.15", + "once_cell", + "version_check", + "zerocopy 0.7.35", +] + [[package]] name = "aho-corasick" version = "1.1.3" @@ -72,6 +86,27 @@ dependencies = [ "memchr", ] +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "android-tzdata" version = "0.1.1" @@ -143,6 +178,28 @@ version = "1.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b964d184e89d9b6b67dd2715bc8e74cf3107fb2b529990c90cf517326150bf4" +[[package]] +name = "api" +version = "0.0.1" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "datafusion", + "ecstore", + "futures", + "futures-core", + "http", + "object_store", + "s3s", + "snafu", + "tokio", + "tokio-util", + "tracing", + "transform-stream", + "url", +] + [[package]] name = "arc-swap" version = "1.7.1" @@ -161,12 +218,227 @@ dependencies = [ "password-hash", ] +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + [[package]] name = "arrayvec" version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" +[[package]] +name = "arrow" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc208515aa0151028e464cc94a692156e945ce5126abd3537bb7fd6ba2143ed1" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e07e726e2b3f7816a85c6a45b6ec118eeeabf0b2a8c208122ad949437181f49a" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "num", +] + +[[package]] +name = "arrow-array" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2262eba4f16c78496adfd559a29fe4b24df6088efc9985a873d58e92be022d5" +dependencies = [ + "ahash 0.8.11", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.15.2", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4e899dade2c3b7f5642eb8366cfd898958bcca099cde6dfea543c7e8d3ad88d4" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4103d88c5b441525ed4ac23153be7458494c2b0c9a11115848fdb9b81f6f886a" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "atoi", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d3cb0914486a3cae19a5cad2598e44e225d53157926d0ada03c20521191a65" +dependencies = [ + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "lazy_static", + "regex", +] + +[[package]] +name = "arrow-data" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a329fb064477c9ec5f0870d2f5130966f91055c7c5bce2b3a084f116bc28c3b" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-ipc" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddecdeab02491b1ce88885986e25002a3da34dd349f682c7cfe67bab7cc17b86" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "flatbuffers", + "lz4_flex", +] + +[[package]] +name = "arrow-json" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d03b9340013413eb84868682ace00a1098c81a5ebc96d279f7ebf9a4cac3c0fd" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap 2.7.1", + "lexical-core", + "num", + "serde", + "serde_json", +] + +[[package]] +name = "arrow-ord" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f841bfcc1997ef6ac48ee0305c4dfceb1f7c786fe31e67c1186edf775e1f1160" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", +] + +[[package]] +name = "arrow-row" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1eeb55b0a0a83851aa01f2ca5ee5648f607e8506ba6802577afdda9d75cdedcd" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", +] + +[[package]] +name = "arrow-schema" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85934a9d0261e0fa5d4e2a5295107d743b543a6e0484a835d4b8db2da15306f9" + +[[package]] +name = "arrow-select" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e2932aece2d0c869dd2125feb9bd1709ef5c445daa3838ac4112dcfa0fda52c" +dependencies = [ + "ahash 0.8.11", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "arrow-string" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "912e38bd6a7a7714c1d9b61df80315685553b7455e8a6045c27531d8ecd5b458" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num", + "regex", + "regex-syntax 0.8.5", +] + [[package]] name = "ashpd" version = "0.8.1" @@ -229,6 +501,23 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "async-compression" +version = "0.4.19" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" +dependencies = [ + "bzip2", + "flate2", + "futures-core", + "memchr", + "pin-project-lite", + "tokio", + "xz2", + "zstd", + "zstd-safe", +] + [[package]] name = "async-io" version = "2.4.0" @@ -493,6 +782,19 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bigdecimal" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f31f3af01c5c65a07985c804d3366560e6fa7883d640a122819b14ec327482c" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -517,6 +819,19 @@ dependencies = [ "digest 0.10.7", ] +[[package]] +name = "blake3" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "675f87afced0413c9bb02843499dbbd3882a237645883f71a2b59644a6d2f753" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", +] + [[package]] name = "block" version = "0.1.6" @@ -572,6 +887,27 @@ dependencies = [ "piper", ] +[[package]] +name = "brotli" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc97b8f16f944bba54f0433f07e30be199b6dc2bd25937444bbad560bcea29bd" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "4.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74fa05ad7d803d413eb8380983b092cbbaf9a85f151b871360e7b00cd7060b37" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + [[package]] name = "bumpalo" version = "3.17.0" @@ -605,6 +941,25 @@ dependencies = [ "bytes", ] +[[package]] +name = "bzip2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +dependencies = [ + "bzip2-sys", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.13+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "cairo-rs" version = "0.18.5" @@ -636,6 +991,8 @@ version = "1.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" dependencies = [ + "jobserver", + "libc", "shlex", ] @@ -704,9 +1061,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.40" +version = "0.4.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a7964611d71df112cb1730f2ee67324fcf4d0fc6606acbbe9bfe06df124637c" +checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" dependencies = [ "android-tzdata", "iana-time-zone", @@ -714,7 +1071,28 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-link", + "windows-targets 0.52.6", +] + +[[package]] +name = "chrono-tz" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c6ac4f2c0bf0f44e9161aec9675e1050aa4a530663c4a9e37e108fa948bca9f" +dependencies = [ + "chrono", + "chrono-tz-build", + "phf 0.11.3", +] + +[[package]] +name = "chrono-tz-build" +version = "0.4.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e94fea34d77a245229e7746bd2beb786cd2a896f306ff491fb8cecb3074b10a7" +dependencies = [ + "parse-zoneinfo", + "phf_codegen 0.11.3", ] [[package]] @@ -871,6 +1249,16 @@ dependencies = [ "memchr", ] +[[package]] +name = "comfy-table" +version = "7.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a" +dependencies = [ + "unicode-segmentation", + "unicode-width", +] + [[package]] name = "common" version = "0.0.1" @@ -908,6 +1296,26 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1cb3c4a0d3776f7535c32793be81d6d5fec0d48ac70955d9834e643aa249a52f" +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.15", + "once_cell", + "tiny-keccak", +] + [[package]] name = "const-serialize" version = "0.6.2" @@ -969,6 +1377,12 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + [[package]] name = "convert_case" version = "0.4.0" @@ -1173,6 +1587,27 @@ dependencies = [ "syn 2.0.98", ] +[[package]] +name = "csv" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" +dependencies = [ + "csv-core", + "itoa 1.0.14", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" +dependencies = [ + "memchr", +] + [[package]] name = "ctr" version = "0.9.2" @@ -1202,6 +1637,7 @@ dependencies = [ "ident_case", "proc-macro2", "quote", + "strsim", "syn 2.0.98", ] @@ -1229,12 +1665,505 @@ dependencies = [ "parking_lot_core 0.9.10", ] +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core 0.9.10", +] + [[package]] name = "data-encoding" version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "575f75dfd25738df5b91b8e43e14d44bda14637a58fae779fd2b064f8bf3e010" +[[package]] +name = "datafusion" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46b879c1aa3a85ecbfa376704f0fe4bfebae1a44a5d35faa4466bf85469b6a0e" +dependencies = [ + "arrow", + "arrow-ipc", + "arrow-schema", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + 
"datafusion-functions-nested", + "datafusion-functions-table", + "datafusion-functions-window", + "datafusion-macros", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-sql", + "flate2", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot 0.12.3", + "parquet", + "rand 0.8.5", + "regex", + "sqlparser", + "tempfile", + "tokio", + "url", + "uuid", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-catalog" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e42f516243fe30137f2b7d5712611286baf8d1d758a46157bada7c35fdf38df" +dependencies = [ + "arrow", + "async-trait", + "dashmap 6.1.0", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-plan", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", + "parking_lot 0.12.3", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e495290c231d617f0a940860a885cb2f4c3efe46c1983c30d3fa12faf1ccb208" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "futures", + "log", + "object_store", + "tokio", +] + +[[package]] +name = "datafusion-common" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af67ddc82e1c8e6843c326ca13aa20e5420cce9f886b4e1ee39ea43defae3145" +dependencies = [ + "ahash 0.8.11", + "arrow", + "arrow-ipc", + "base64", + "half", + "hashbrown 0.14.5", + "indexmap 2.7.1", + "libc", + "log", + "object_store", + "parquet", + "paste", + "recursive", + "sqlparser", + "tokio", + "web-time", +] + +[[package]] +name = "datafusion-common-runtime" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36ee9403a2ec39183437825d232f556a5dee89f13f6fd78f8c7f8f999489e4ca" +dependencies = [ + "log", + "tokio", +] + +[[package]] +name = "datafusion-datasource" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8c8b7568b638dd309bcc1cdeb66776f233b110d44bdc6fd67ef1919f9ec9803" +dependencies = [ + "arrow", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "flate2", + "futures", + "glob", + "itertools 0.14.0", + "log", + "object_store", + "rand 0.8.5", + "tokio", + "tokio-util", + "url", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-doc" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8612c81304578a2e2b82d31caf8173312cb086a7a23a23556b9fff3ac7c18221" + +[[package]] +name = "datafusion-execution" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3591e6d4900e57bad7f861f14f5c763f716da76553b0d037ec91c192c876f09c" +dependencies = [ + "arrow", + "dashmap 6.1.0", + "datafusion-common", + "datafusion-expr", + "futures", + "log", + "object_store", + "parking_lot 0.12.3", + "rand 0.8.5", + "tempfile", + "url", +] + +[[package]] 
+name = "datafusion-expr" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5033d0f6198d177f50a7721d80db141af15dd12f45ad6dce34e2cdbb6538e39d" +dependencies = [ + "arrow", + "chrono", + "datafusion-common", + "datafusion-doc", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr-common", + "indexmap 2.7.1", + "paste", + "recursive", + "serde_json", + "sqlparser", +] + +[[package]] +name = "datafusion-expr-common" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56def48a7dfb9f92aa18e18dfdffaca79b5383f03c59bb0107959c1698634557" +dependencies = [ + "arrow", + "datafusion-common", + "indexmap 2.7.1", + "itertools 0.14.0", + "paste", +] + +[[package]] +name = "datafusion-functions" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a79b703b42b0aac97485b84c6810c78114b0974a75a33514840ba0bbe0de38f" +dependencies = [ + "arrow", + "arrow-buffer", + "base64", + "blake2", + "blake3", + "chrono", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-macros", + "hex", + "itertools 0.14.0", + "log", + "md-5", + "rand 0.8.5", + "regex", + "sha2 0.10.8", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdad20375e85365ed262b5583955c308840efc6ff9271ff463cf86789adfb686" +dependencies = [ + "ahash 0.8.11", + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "half", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-aggregate-common" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff73249ee3cdc81ad04317d3b4231fc02a8c03a3a1b4b13953244e6443f6b498" +dependencies = [ + "ahash 0.8.11", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-functions-nested" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20dcd70c58f17b7ce937866e43c75293a3250aadc1db830ad6d502967aaffb40" +dependencies = [ + "arrow", + "arrow-ord", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-macros", + "datafusion-physical-expr-common", + "itertools 0.14.0", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-table" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac12628c3e43461118e95d5772f729e1cc39db883d8ee52e4b80038b0f614bbf" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot 0.12.3", + "paste", +] + +[[package]] +name = "datafusion-functions-window" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03eb449555c7cc03bb61d43d90edef70d070d34bc4a0d8f7e358d157232f3220" +dependencies = [ + "datafusion-common", + "datafusion-doc", + "datafusion-expr", + "datafusion-functions-window-common", + 
"datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a0c7606e568ee6a15d33a2532eb0d18e7769bb88af55f6b70be4db9fd937d18" +dependencies = [ + "datafusion-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-macros" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64030e805d3d257e3012e4378500d4ac90b1ebacd03f1110e8ec927b77f09486" +dependencies = [ + "datafusion-expr", + "quote", + "syn 2.0.98", +] + +[[package]] +name = "datafusion-optimizer" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae6af7bdae7565aa7a4cb1deb7fe18d89c63c5d93b5203b473ca1dbe02a1cd3d" +dependencies = [ + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", + "indexmap 2.7.1", + "itertools 0.14.0", + "log", + "recursive", + "regex", + "regex-syntax 0.8.5", +] + +[[package]] +name = "datafusion-physical-expr" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f68601feda90c255c9023a881e833efca9d7539bab0565ac1355b0249326e91" +dependencies = [ + "ahash 0.8.11", + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", + "half", + "hashbrown 0.14.5", + "indexmap 2.7.1", + "itertools 0.14.0", + "log", + "paste", + "petgraph", +] + +[[package]] +name = "datafusion-physical-expr-common" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00c1a08b00d340ca3bc1cd2f094ecaeaf6f099a2980e11255976660fa0409182" +dependencies = [ + "ahash 0.8.11", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "hashbrown 0.14.5", + "itertools 0.14.0", +] + +[[package]] +name = "datafusion-physical-optimizer" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd34f3438cf9629ea0e3425027582334fb6671a05ee43671ca3c47896b75dda" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "itertools 0.14.0", + "log", + "recursive", +] + +[[package]] +name = "datafusion-physical-plan" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7624484ada341d30ef465eae61f760e779f080c621bbc3dc0335a75fa78e8dec" +dependencies = [ + "ahash 0.8.11", + "arrow", + "arrow-ord", + "arrow-schema", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "futures", + "half", + "hashbrown 0.14.5", + "indexmap 2.7.1", + "itertools 0.14.0", + "log", + "parking_lot 0.12.3", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "datafusion-sql" +version = "46.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e717736a394ed92d9dcf2d74439c655474dd39aa65a064a6bae697b6d20e5fe" +dependencies = [ + "arrow", + "bigdecimal", + "datafusion-common", + "datafusion-expr", + "indexmap 2.7.1", + "log", + "recursive", + "regex", + 
"sqlparser", +] + [[package]] name = "dbus" version = "0.9.7" @@ -1269,6 +2198,37 @@ dependencies = [ "serde", ] +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.98", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn 2.0.98", +] + [[package]] name = "derive_more" version = "0.99.19" @@ -2686,6 +3646,7 @@ checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" dependencies = [ "cfg-if", "crunchy", + "num-traits", ] [[package]] @@ -2694,7 +3655,7 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" dependencies = [ - "ahash", + "ahash 0.7.8", ] [[package]] @@ -2702,6 +3663,10 @@ name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash 0.8.11", + "allocator-api2", +] [[package]] name = "hashbrown" @@ -2940,7 +3905,7 @@ dependencies = [ "ecstore", "futures", "ipnetwork", - "itertools", + "itertools 0.14.0", "jsonwebtoken", "lazy_static", "log", @@ -3174,6 +4139,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + [[package]] name = "ipnet" version = "2.11.0" @@ -3201,6 +4172,15 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.14.0" @@ -3267,6 +4247,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" version = "0.3.77" @@ -3342,6 +4331,70 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "lexical-core" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + 
"lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-util" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" +dependencies = [ + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" +dependencies = [ + "lexical-util", + "static_assertions", +] + [[package]] name = "libappindicator" version = "0.9.0" @@ -3511,6 +4564,26 @@ dependencies = [ "hashbrown 0.12.3", ] +[[package]] +name = "lz4_flex" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" +dependencies = [ + "twox-hash", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "mac" version = "0.1.1" @@ -3925,6 +4998,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -4174,6 +5248,27 @@ dependencies = [ "memchr", ] +[[package]] +name = "object_store" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cfccb68961a56facde1163f9319e0d15743352344e7808a11795fb99698dcaf" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "humantime", + "itertools 0.13.0", + "parking_lot 0.12.3", + "percent-encoding", + "snafu", + "tokio", + "tracing", + "url", + "walkdir", +] + [[package]] name = "once_cell" version = "1.20.3" @@ -4192,6 +5287,15 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + [[package]] name = "ordered-stream" version = "0.2.0" @@ -4293,6 +5397,52 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "parquet" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f88838dca3b84d41444a0341b19f347e8098a3898b0f21536654b8b799e11abd" +dependencies = [ + "ahash 0.8.11", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown 0.15.2", + "lz4_flex", + "num", + "num-bigint", + "object_store", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", + "zstd-sys", +] + +[[package]] +name = "parse-zoneinfo" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f2a05b18d44e2957b88f96ba460715e295bc1d7510468a2f3d3b44535d26c24" +dependencies = [ + "regex", +] + [[package]] name = "password-hash" version = "0.5.0" @@ -4390,6 +5540,15 @@ dependencies = [ "phf_shared 0.10.0", ] +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_shared 0.11.3", +] + [[package]] name = "phf_codegen" version = "0.8.0" @@ -4410,6 +5569,16 @@ dependencies = [ "phf_shared 0.10.0", ] +[[package]] +name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", +] + [[package]] name = "phf_generator" version = "0.8.0" @@ -4554,7 +5723,7 @@ dependencies = [ "crypto", "futures", "ipnetwork", - "itertools", + "itertools 0.14.0", "jsonwebtoken", "lazy_static", "log", @@ -4748,7 +5917,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" dependencies = [ "heck 0.5.0", - "itertools", + "itertools 0.14.0", "log", "multimap", "once_cell", @@ -4768,7 +5937,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools", + "itertools 0.14.0", "proc-macro2", "quote", "syn 2.0.98", @@ -4818,6 +5987,32 @@ dependencies = [ "tower 0.5.2", ] +[[package]] +name = "psm" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f58e5423e24c18cc840e1c98370b3993c6649cd1678b4d24318bcf0a083cbe88" +dependencies = [ + "cc", +] + +[[package]] +name = "query" +version = "0.0.1" +dependencies = [ + "api", + "async-recursion", + "async-trait", + "datafusion", + "derive_builder", + "futures", + "parking_lot 0.12.3", + "s3s", + "snafu", + "tokio", + "tracing", +] + [[package]] name = "quick-xml" version = "0.37.2" @@ -4877,7 +6072,7 @@ dependencies = [ "once_cell", "socket2", "tracing", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -5012,6 +6207,26 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20675572f6f24e9e76ef639bc5552774ed45f1c30e2951e1e99c59888861c539" +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.98", +] + [[package]] name = "redox_syscall" version = "0.2.16" @@ -5310,6 +6525,7 @@ dependencies = [ name = "rustfs" version = "0.1.0" dependencies = [ + "api", "async-trait", "atoi", "axum", @@ -5319,6 +6535,8 @@ dependencies = [ "common", "const-str", "crypto", + "csv", + "datafusion", "ecstore", "flatbuffers", "futures", @@ -5346,6 +6564,7 @@ dependencies = [ "prost-types", "protobuf", "protos", + "query", "rmp-serde", "rust-embed", "s3s", @@ -5610,6 +6829,12 @@ dependencies = [ "futures-core", ] +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + [[package]] name = "serde" version = "1.0.218" @@ -5714,7 +6939,7 @@ checksum = "4fae7a3038a32e5a34ba32c6c45eb4852f8affaf8b794ebfcd4b1099e2d62ebe" dependencies = [ "bytes", "const_format", - "dashmap", + "dashmap 5.5.3", "futures", "gloo-net", "http", @@ -5957,6 +7182,34 @@ version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" +[[package]] +name = "snafu" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "223891c85e2a29c3fe8fb900c1fae5e69c2e42415e3177752e8718475efa5019" +dependencies = [ + "backtrace", + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.98", +] + +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + [[package]] name = "socket2" version = "0.5.8" @@ -5999,12 +7252,47 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "sqlparser" +version = "0.54.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c66e3b7374ad4a6af849b08b3e7a6eda0edbd82f0fd59b57e22671bf16979899" +dependencies = [ + "log", + "recursive", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "stacker" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9156ebd5870ef293bfb43f91c7a74528d363ec0d424afe24160ed5a4343d08a" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + [[package]] name = "static_assertions" version = "1.1.0" @@ -6327,6 +7615,17 @@ dependencies = [ "once_cell", ] +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ 
+ "byteorder", + "integer-encoding", + "ordered-float", +] + [[package]] name = "time" version = "0.3.37" @@ -6360,6 +7659,15 @@ dependencies = [ "time-core", ] +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "tinystr" version = "0.7.6" @@ -6790,6 +8098,16 @@ dependencies = [ "utf-8", ] +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "static_assertions", +] + [[package]] name = "typenum" version = "1.18.0" @@ -6825,6 +8143,12 @@ version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" + [[package]] name = "unicode-xid" version = "0.2.6" @@ -6896,9 +8220,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0f540e3240398cce6128b64ba83fdbdd86129c16a3aa1a3a252efd66eb3d587" dependencies = [ "getrandom 0.3.1", + "js-sys", "rand 0.9.0", "serde", "uuid-macro-internal", + "wasm-bindgen", ] [[package]] @@ -7737,6 +9063,15 @@ version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "yoke" version = "0.7.5" @@ -7965,6 +9300,34 @@ dependencies = [ "syn 2.0.98", ] +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.13+zstd.1.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "zvariant" version = "4.2.0" diff --git a/Cargo.toml b/Cargo.toml index 9e3e8061..adae8650 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,8 @@ members = [ "iam", # Identity and Access Management "crypto", # Cryptography and security features "cli/rustfs-gui", # Graphical user interface client + "s3select/api", + "s3select/query", ] resolver = "2" @@ -29,12 +31,15 @@ all = "warn" [workspace.dependencies] madmin = { path = "./madmin" } +async-recursion = "1.0.5" async-trait = "0.1.86" backon = "1.3.0" bytes = "1.9.0" bytesize = "1.3.0" -chrono = { version = "0.4.40", features = ["serde"] } +chrono = { version = "0.4.39", features = ["serde"] } clap = { version = "4.5.31", features = ["derive", "env"] } +datafusion = "46.0.0" +derive_builder = "0.20.2" dioxus 
= { version = "0.6.3", features = ["router"] } dirs = "6.0.0" ecstore = { path = "./ecstore" } @@ -94,6 +99,7 @@ tonic = { version = "0.12.3", features = ["gzip"] } tonic-build = "0.12.3" tonic-reflection = "0.12" tokio-stream = "0.1.17" +tokio-util = { version = "0.7.13", features = ["io", "compat"] } tower = { version = "0.5.2", features = ["timeout"] } tracing = "0.1.41" tracing-error = "0.2.1" @@ -112,7 +118,7 @@ md-5 = "0.10.6" workers = { path = "./common/workers" } test-case = "3.3.1" zip = "2.2.3" - +snafu = "0.8.5" [profile.wasm-dev] diff --git a/ecstore/src/utils/os/linux.rs b/ecstore/src/utils/os/linux.rs index ca64f818..064b74ae 100644 --- a/ecstore/src/utils/os/linux.rs +++ b/ecstore/src/utils/os/linux.rs @@ -151,7 +151,7 @@ fn read_drive_stats(stats_file: &str) -> Result { fn read_stat(file_name: &str) -> Result> { // 打开文件 let path = Path::new(file_name); - let file = File::open(&path)?; + let file = File::open(path)?; // 创建一个 BufReader let reader = io::BufReader::new(file); @@ -161,7 +161,8 @@ fn read_stat(file_name: &str) -> Result> { if let Some(line) = reader.lines().next() { let line = line?; // 分割行并解析为 u64 - for token in line.trim().split_whitespace() { + // https://rust-lang.github.io/rust-clippy/master/index.html#trim_split_whitespace + for token in line.split_whitespace() { let ui64: u64 = token.parse()?; stats.push(ui64); } diff --git a/rustfs/Cargo.toml b/rustfs/Cargo.toml index f0c8d118..493971d0 100644 --- a/rustfs/Cargo.toml +++ b/rustfs/Cargo.toml @@ -20,6 +20,8 @@ log.workspace = true async-trait.workspace = true bytes.workspace = true clap.workspace = true +csv = "1.3.1" +datafusion = { workspace = true } common.workspace = true ecstore.workspace = true policy.workspace =true @@ -45,7 +47,7 @@ serde.workspace = true serde_json.workspace = true tracing.workspace = true time = { workspace = true, features = ["parsing", "formatting", "serde"] } -tokio-util = { version = "0.7.13", features = ["io", "compat"] } +tokio-util.workspace = true tokio = { workspace = true, features = [ "rt-multi-thread", "macros", @@ -69,6 +71,8 @@ const-str = { version = "0.6.1", features = ["std", "proc"] } atoi = "2.0.0" serde_urlencoded = "0.7.1" crypto = { path = "../crypto" } +query = { path = "../s3select/query" } +api = { path = "../s3select/api" } iam = { path = "../iam" } jsonwebtoken = "9.3.0" tower-http = { version = "0.6.2", features = ["cors"] } diff --git a/rustfs/src/storage/ecfs.rs b/rustfs/src/storage/ecfs.rs index 51cf0ea9..1fc410cb 100644 --- a/rustfs/src/storage/ecfs.rs +++ b/rustfs/src/storage/ecfs.rs @@ -4,8 +4,14 @@ use super::options::extract_metadata; use super::options::put_opts; use crate::auth::get_condition_values; use crate::storage::access::ReqInfo; +use api::query::Context; +use api::query::Query; +use api::server::dbms::DatabaseManagerSystem; use bytes::Bytes; use common::error::Result; +use datafusion::arrow::csv::WriterBuilder as CsvWriterBuilder; +use datafusion::arrow::json::writer::JsonArray; +use datafusion::arrow::json::WriterBuilder as JsonWriterBuilder; use ecstore::bucket::error::BucketMetadataError; use ecstore::bucket::metadata::BUCKET_LIFECYCLE_CONFIG; use ecstore::bucket::metadata::BUCKET_NOTIFICATION_CONFIG; @@ -47,6 +53,7 @@ use policy::policy::action::S3Action; use policy::policy::BucketPolicy; use policy::policy::BucketPolicyArgs; use policy::policy::Validator; +use query::instance::make_rustfsms; use s3s::dto::*; use s3s::s3_error; use s3s::S3Error; @@ -56,6 +63,8 @@ use s3s::S3; use s3s::{S3Request, S3Response}; use std::fmt::Debug; 
 use std::str::FromStr;
+use tokio::sync::mpsc;
+use tokio_stream::wrappers::ReceiverStream;
 use tokio_util::io::ReaderStream;
 use tokio_util::io::StreamReader;
 use tracing::debug;
@@ -1859,6 +1868,69 @@ impl S3 for FS {
         }
         Ok(S3Response::new(PutObjectAclOutput::default()))
     }
+
+    async fn select_object_content(
+        &self,
+        req: S3Request<SelectObjectContentInput>,
+    ) -> S3Result<S3Response<SelectObjectContentOutput>> {
+        info!("handle select_object_content");
+
+        let input = req.input;
+        info!("{:?}", input);
+
+        // Build the query engine over the selected object, then run the SQL expression.
+        let db = make_rustfsms(input.clone(), false).await.map_err(|e| {
+            error!("make db failed: {}", e.to_string());
+            s3_error!(InternalError)
+        })?;
+        let query = Query::new(Context { input: input.clone() }, input.request.expression);
+        let result = db.execute(&query).await.map_err(|_| s3_error!(InternalError))?;
+
+        let results = result
+            .result()
+            .chunk_result()
+            .await
+            .map_err(|_| s3_error!(InternalError))?
+            .to_vec();
+
+        // Encode the collected record batches into the requested output format.
+        let mut buffer = Vec::new();
+        if input.request.output_serialization.csv.is_some() {
+            let mut csv_writer = CsvWriterBuilder::new().with_header(false).build(&mut buffer);
+            for batch in results {
+                csv_writer
+                    .write(&batch)
+                    .map_err(|e| s3_error!(InternalError, "can't encode output to csv: {}", e.to_string()))?;
+            }
+        } else if input.request.output_serialization.json.is_some() {
+            let mut json_writer = JsonWriterBuilder::new()
+                .with_explicit_nulls(true)
+                .build::<_, JsonArray>(&mut buffer);
+            for batch in results {
+                json_writer
+                    .write(&batch)
+                    .map_err(|e| s3_error!(InternalError, "can't encode output to json: {}", e.to_string()))?;
+            }
+            json_writer
+                .finish()
+                .map_err(|e| s3_error!(InternalError, "can't finish json output: {}", e.to_string()))?;
+        } else {
+            return Err(s3_error!(InvalidArgument, "unknown output format"));
+        }
+
+        // Reply with a Cont/Records/End event sequence over a small channel-backed stream.
+        let (tx, rx) = mpsc::channel::<S3Result<SelectObjectContentEvent>>(2);
+        let stream = ReceiverStream::new(rx);
+        tokio::spawn(async move {
+            let _ = tx
+                .send(Ok(SelectObjectContentEvent::Cont(ContinuationEvent::default())))
+                .await;
+            let _ = tx
+                .send(Ok(SelectObjectContentEvent::Records(RecordsEvent {
+                    payload: Some(Bytes::from(buffer)),
+                })))
+                .await;
+            let _ = tx.send(Ok(SelectObjectContentEvent::End(EndEvent::default()))).await;
+
+            drop(tx);
+        });
+
+        Ok(S3Response::new(SelectObjectContentOutput {
+            payload: Some(SelectObjectContentEventStream::new(stream)),
+        }))
+    }
 }
 
 #[allow(dead_code)]
diff --git a/s3select/api/Cargo.toml b/s3select/api/Cargo.toml
new file mode 100644
index 00000000..1936e846
--- /dev/null
+++ b/s3select/api/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "api"
+version.workspace = true
+edition.workspace = true
+
+[dependencies]
+async-trait.workspace = true
+bytes.workspace = true
+chrono.workspace = true
+datafusion = { workspace = true }
+ecstore.workspace = true
+futures = { workspace = true }
+futures-core = "0.3.31"
+http.workspace = true
+object_store = "0.11.2"
+s3s.workspace = true
+snafu = { workspace = true, features = ["backtrace"] }
+tokio.workspace = true
+tokio-util.workspace = true
+tracing.workspace = true
+transform-stream.workspace = true
+url.workspace = true
\ No newline at end of file
diff --git a/s3select/api/src/lib.rs b/s3select/api/src/lib.rs
new file mode 100644
index 00000000..acaead72
--- /dev/null
+++ b/s3select/api/src/lib.rs
@@ -0,0 +1,77 @@
+use std::fmt::Display;
+
+use datafusion::{common::DataFusionError, sql::sqlparser::parser::ParserError};
+use snafu::{Backtrace, Location, Snafu};
+
+pub mod object_store;
+pub mod query;
+pub mod server;
+
+pub type QueryResult<T> = Result<T, QueryError>;
+
+#[derive(Debug, Snafu)]
+#[snafu(visibility(pub))]
+pub enum QueryError {
+    Datafusion {
+        source: DataFusionError,
+        location: Location,
+        backtrace: Backtrace,
+    },
+
+    #[snafu(display("This feature is not implemented: {}", err))]
+    NotImplemented { err: String },
+
+    #[snafu(display("Multi-statement not allowed, found num: {}, sql: {}", num, sql))]
+    MultiStatement { num: usize, sql: String },
+
+    #[snafu(display("Failed to build QueryDispatcher. err: {}", err))]
+    BuildQueryDispatcher { err: String },
+
+    #[snafu(display("The query has been canceled"))]
+    Cancel,
+
+    #[snafu(display("{}", source))]
+    Parser { source: ParserError },
+
+    #[snafu(display("UDF does not exist, name: {}.", name))]
+    FunctionNotExists { name: String },
+
+    #[snafu(display("UDF already exists, name: {}.", name))]
+    FunctionExists { name: String },
+
+    #[snafu(display("Store error: {}.", e))]
+    StoreError { e: String },
+}
+
+impl From<DataFusionError> for QueryError {
+    fn from(value: DataFusionError) -> Self {
+        match value {
+            // Unwrap a QueryError that was carried through DataFusion as an external error.
+            DataFusionError::External(e) if e.downcast_ref::<QueryError>().is_some() => *e.downcast::<QueryError>().unwrap(),
+
+            v => Self::Datafusion {
+                source: v,
+                location: Default::default(),
+                backtrace: Backtrace::capture(),
+            },
+        }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ResolvedTable {
+    // path
+    table: String,
+}
+
+impl ResolvedTable {
+    pub fn table(&self) -> &str {
+        &self.table
+    }
+}
+
+impl Display for ResolvedTable {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self { table } = self;
+        write!(f, "{table}")
+    }
+}
diff --git a/s3select/api/src/object_store.rs b/s3select/api/src/object_store.rs
new file mode 100644
index 00000000..7772936c
--- /dev/null
+++ b/s3select/api/src/object_store.rs
@@ -0,0 +1,177 @@
+use async_trait::async_trait;
+use bytes::Bytes;
+use chrono::Utc;
+use ecstore::io::READ_BUFFER_SIZE;
+use ecstore::new_object_layer_fn;
+use ecstore::store::ECStore;
+use ecstore::store_api::ObjectIO;
+use ecstore::store_api::ObjectOptions;
+use ecstore::StorageAPI;
+use futures::pin_mut;
+use futures::{Stream, StreamExt};
+use futures_core::stream::BoxStream;
+use http::HeaderMap;
+use object_store::path::Path;
+use object_store::Attributes;
+use object_store::GetOptions;
+use object_store::GetResult;
+use object_store::ListResult;
+use object_store::MultipartUpload;
+use object_store::ObjectMeta;
+use object_store::ObjectStore;
+use object_store::PutMultipartOpts;
+use object_store::PutOptions;
+use object_store::PutPayload;
+use object_store::PutResult;
+use object_store::{Error as o_Error, Result};
+use s3s::dto::SelectObjectContentInput;
+use s3s::s3_error;
+use s3s::S3Result;
+use std::ops::Range;
+use std::sync::Arc;
+use tokio_util::io::ReaderStream;
+use tracing::info;
+use transform_stream::AsyncTryStream;
+
+/// Read-only object_store adapter that exposes the object addressed by a
+/// SelectObjectContentInput from the EC store to DataFusion.
+#[derive(Debug)]
+pub struct EcObjectStore {
+    input: SelectObjectContentInput,
+
+    store: Arc<ECStore>,
+}
+
+impl EcObjectStore {
+    pub fn new(input: SelectObjectContentInput) -> S3Result<Self> {
+        let Some(store) = new_object_layer_fn() else {
+            return Err(s3_error!(InternalError, "ec store not initialized"));
+        };
+
+        Ok(Self { input, store })
+    }
+}
+
+impl std::fmt::Display for EcObjectStore {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str("EcObjectStore")
+    }
+}
+
+#[async_trait]
+impl ObjectStore for EcObjectStore {
+    async fn put_opts(&self, _location: &Path, _payload: PutPayload, _opts: PutOptions) -> Result<PutResult> {
+        unimplemented!()
+    }
+
+    async fn put_multipart_opts(&self, _location: &Path, _opts: PutMultipartOpts) -> Result<Box<dyn MultipartUpload>> {
+        unimplemented!()
+    }
+
+    async fn get_opts(&self, location: &Path, _options: GetOptions) -> Result<GetResult> {
+        info!("{:?}", location);
+        let opts = ObjectOptions::default();
+        let h = HeaderMap::new();
+        let reader = self
+            .store
+            .get_object_reader(&self.input.bucket, &self.input.key, None, h, &opts)
+            .await
+            .map_err(|_| o_Error::NotFound {
+                path: format!("{}/{}", self.input.bucket, self.input.key),
+                source: "cannot get object info".into(),
+            })?;
+
+        let meta = ObjectMeta {
+            location: location.clone(),
+            last_modified: Utc::now(),
+            size: reader.object_info.size,
+            e_tag: reader.object_info.etag,
+            version: None,
+        };
+        let attributes = Attributes::default();
+
+        Ok(GetResult {
+            payload: object_store::GetResultPayload::Stream(
+                bytes_stream(ReaderStream::with_capacity(reader.stream, READ_BUFFER_SIZE), reader.object_info.size).boxed(),
+            ),
+            meta,
+            range: 0..reader.object_info.size,
+            attributes,
+        })
+    }
+
+    async fn get_ranges(&self, _location: &Path, _ranges: &[Range<usize>]) -> Result<Vec<Bytes>> {
+        unimplemented!()
+    }
+
+    async fn head(&self, location: &Path) -> Result<ObjectMeta> {
+        info!("{:?}", location);
+        let opts = ObjectOptions::default();
+        let info = self
+            .store
+            .get_object_info(&self.input.bucket, &self.input.key, &opts)
+            .await
+            .map_err(|_| o_Error::NotFound {
+                path: format!("{}/{}", self.input.bucket, self.input.key),
+                source: "cannot get object info".into(),
+            })?;
+
+        Ok(ObjectMeta {
+            location: location.clone(),
+            last_modified: Utc::now(),
+            size: info.size,
+            e_tag: info.etag,
+            version: None,
+        })
+    }
+
+    async fn delete(&self, _location: &Path) -> Result<()> {
+        unimplemented!()
+    }
+
+    fn list(&self, _prefix: Option<&Path>) -> BoxStream<'_, Result<ObjectMeta>> {
+        unimplemented!()
+    }
+
+    async fn list_with_delimiter(&self, _prefix: Option<&Path>) -> Result<ListResult> {
+        unimplemented!()
+    }
+
+    async fn copy(&self, _from: &Path, _to: &Path) -> Result<()> {
+        unimplemented!()
+    }
+
+    async fn copy_if_not_exists(&self, _from: &Path, _to: &Path) -> Result<()> {
+        unimplemented!()
+    }
+}
+
+/// Adapt an I/O byte stream into an object_store result stream, truncating it
+/// to `content_length` bytes.
+pub fn bytes_stream<S>(stream: S, content_length: usize) -> impl Stream<Item = Result<Bytes>> + Send + 'static
+where
+    S: Stream<Item = std::io::Result<Bytes>> + Send + 'static,
+{
+    AsyncTryStream::<Bytes, o_Error, _>::new(|mut y| async move {
+        pin_mut!(stream);
+        let mut remaining: usize = content_length;
+        while let Some(result) = stream.next().await {
+            let mut bytes = result.map_err(|e| o_Error::Generic {
+                store: "",
+                source: Box::new(e),
+            })?;
+            if bytes.len() > remaining {
+                bytes.truncate(remaining);
+            }
+            remaining -= bytes.len();
+            y.yield_ok(bytes).await;
+        }
+        Ok(())
+    })
+}
diff --git a/s3select/api/src/query/analyzer.rs b/s3select/api/src/query/analyzer.rs
new file mode 100644
index 00000000..db849566
--- /dev/null
+++ b/s3select/api/src/query/analyzer.rs
@@ -0,0 +1,12 @@
+use std::sync::Arc;
+
+use datafusion::logical_expr::LogicalPlan;
+
+use super::session::SessionCtx;
+use crate::QueryResult;
+
+pub type AnalyzerRef = Arc<dyn Analyzer>;
+
+pub trait Analyzer {
+    fn analyze(&self, plan: &LogicalPlan, session: &SessionCtx) -> QueryResult<LogicalPlan>;
+}
diff --git a/s3select/api/src/query/ast.rs b/s3select/api/src/query/ast.rs
new file mode 100644
index 00000000..dbe9b4b2
--- /dev/null
+++ b/s3select/api/src/query/ast.rs
@@ -0,0 +1,8 @@
+use datafusion::sql::sqlparser::ast::Statement;
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum ExtStatement {
+    /// ANSI SQL AST node
+    SqlStatement(Box<Statement>),
+    // we can extend this with custom commands later
+}
diff --git a/s3select/api/src/query/datasource/mod.rs b/s3select/api/src/query/datasource/mod.rs
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/s3select/api/src/query/datasource/mod.rs
@@ -0,0 +1 @@
+
diff --git a/s3select/api/src/query/dispatcher.rs b/s3select/api/src/query/dispatcher.rs
new file mode 100644
index 00000000..433ddf01
--- /dev/null
+++ b/s3select/api/src/query/dispatcher.rs
@@ -0,0 +1,32 @@
+use std::sync::Arc;
+
+use async_trait::async_trait;
+
+use crate::QueryResult;
+
+use super::{
+    execution::{Output, QueryStateMachine},
+    logical_planner::Plan,
+    Query,
+};
+
+#[async_trait]
+pub trait QueryDispatcher: Send + Sync {
+    // fn create_query_id(&self) -> QueryId;
+
+    // fn query_info(&self, id: &QueryId);
+
+    async fn execute_query(&self, query: &Query) -> QueryResult<Output>;
+
+    async fn build_logical_plan(&self, query_state_machine: Arc<QueryStateMachine>) -> QueryResult<Option<Plan>>;
+
+    async fn execute_logical_plan(&self, logical_plan: Plan, query_state_machine: Arc<QueryStateMachine>) -> QueryResult<Output>;
+
+    async fn build_query_state_machine(&self, query: Query) -> QueryResult<Arc<QueryStateMachine>>;
+
+    // fn running_query_infos(&self) -> Vec<QueryInfo>;
+
+    // fn running_query_status(&self) -> Vec<QueryStatus>;
+
+    // fn cancel_query(&self, id: &QueryId);
+}
diff --git a/s3select/api/src/query/execution.rs b/s3select/api/src/query/execution.rs
new file mode 100644
index 00000000..10c48acc
--- /dev/null
+++ b/s3select/api/src/query/execution.rs
@@ -0,0 +1,241 @@
+use std::fmt::Display;
+use std::pin::Pin;
+use std::sync::atomic::{AtomicPtr, Ordering};
+use std::sync::Arc;
+use std::task::{Context, Poll};
+use std::time::{Duration, Instant};
+
+use async_trait::async_trait;
+use datafusion::arrow::datatypes::{Schema, SchemaRef};
+use datafusion::arrow::record_batch::RecordBatch;
+use datafusion::physical_plan::SendableRecordBatchStream;
+use futures::{Stream, StreamExt, TryStreamExt};
+
+use crate::{QueryError, QueryResult};
+
+use super::logical_planner::Plan;
+use super::session::SessionCtx;
+use super::Query;
+
+pub type QueryExecutionRef = Arc<dyn QueryExecution>;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum QueryType {
+    Batch,
+    Stream,
+}
+
+impl Display for QueryType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Batch => write!(f, "batch"),
+            Self::Stream => write!(f, "stream"),
+        }
+    }
+}
+
+#[async_trait]
+pub trait QueryExecution: Send + Sync {
+    fn query_type(&self) -> QueryType {
+        QueryType::Batch
+    }
+    // start the execution
+    async fn start(&self) -> QueryResult<Output>;
+    // stop the execution
+    fn cancel(&self) -> QueryResult<()>;
+}
+
+pub enum Output {
+    StreamData(SendableRecordBatchStream),
+    Nil(()),
+}
+
+impl Output {
+    pub fn schema(&self) -> SchemaRef {
+        match self {
+            Self::StreamData(stream) => stream.schema(),
+            Self::Nil(_) => Arc::new(Schema::empty()),
+        }
+    }
+
+    pub async fn chunk_result(self) -> QueryResult<Vec<RecordBatch>> {
+        match self {
+            Self::Nil(_) => Ok(vec![]),
+            Self::StreamData(stream) => {
+                let schema = stream.schema();
+                let mut res: Vec<RecordBatch> = stream.try_collect::<Vec<_>>().await?;
+                if res.is_empty() {
+                    res.push(RecordBatch::new_empty(schema));
+                }
+                Ok(res)
+            }
+        }
+    }
+
+    pub async fn num_rows(self) -> usize {
+        match self.chunk_result().await {
+            Ok(rb) => rb.iter().map(|e| e.num_rows()).sum(),
+            Err(_) => 0,
+        }
+    }
+
+    /// Returns the number of records affected by the query operation
+    ///
+    /// If it is a select statement, returns the number of rows in the result set
+    ///
+    /// -1 means unknown
+    ///
+    /// Panics when StreamData's number of records is greater than `i64::MAX`
when StreamData's number of records greater than i64::Max + pub async fn affected_rows(self) -> i64 { + self.num_rows().await as i64 + } +} + +impl Stream for Output { + type Item = std::result::Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + match this { + Output::StreamData(stream) => stream.poll_next_unpin(cx).map_err(|e| e.into()), + Output::Nil(_) => Poll::Ready(None), + } + } +} + +#[async_trait] +pub trait QueryExecutionFactory { + async fn create_query_execution( + &self, + plan: Plan, + query_state_machine: QueryStateMachineRef, + ) -> QueryResult; +} + +pub type QueryStateMachineRef = Arc; + +pub struct QueryStateMachine { + pub session: SessionCtx, + pub query: Query, + + state: AtomicPtr, + start: Instant, +} + +impl QueryStateMachine { + pub fn begin(query: Query, session: SessionCtx) -> Self { + Self { + session, + query, + state: AtomicPtr::new(Box::into_raw(Box::new(QueryState::ACCEPTING))), + start: Instant::now(), + } + } + + pub fn begin_analyze(&self) { + // TODO record time + self.translate_to(Box::new(QueryState::RUNNING(RUNNING::ANALYZING))); + } + + pub fn end_analyze(&self) { + // TODO record time + } + + pub fn begin_optimize(&self) { + // TODO record time + self.translate_to(Box::new(QueryState::RUNNING(RUNNING::OPTMIZING))); + } + + pub fn end_optimize(&self) { + // TODO + } + + pub fn begin_schedule(&self) { + // TODO + self.translate_to(Box::new(QueryState::RUNNING(RUNNING::SCHEDULING))); + } + + pub fn end_schedule(&self) { + // TODO + } + + pub fn finish(&self) { + // TODO + self.translate_to(Box::new(QueryState::DONE(DONE::FINISHED))); + } + + pub fn cancel(&self) { + // TODO + self.translate_to(Box::new(QueryState::DONE(DONE::CANCELLED))); + } + + pub fn fail(&self) { + // TODO + self.translate_to(Box::new(QueryState::DONE(DONE::FAILED))); + } + + pub fn state(&self) -> &QueryState { + unsafe { &*self.state.load(Ordering::Relaxed) } + } + + pub fn duration(&self) -> Duration { + self.start.elapsed() + } + + fn translate_to(&self, state: Box) { + self.state.store(Box::into_raw(state), Ordering::Relaxed); + } +} + +#[derive(Debug, Clone)] +pub enum QueryState { + ACCEPTING, + RUNNING(RUNNING), + DONE(DONE), +} + +impl AsRef for QueryState { + fn as_ref(&self) -> &str { + match self { + QueryState::ACCEPTING => "ACCEPTING", + QueryState::RUNNING(e) => e.as_ref(), + QueryState::DONE(e) => e.as_ref(), + } + } +} + +#[derive(Debug, Clone)] +pub enum RUNNING { + DISPATCHING, + ANALYZING, + OPTMIZING, + SCHEDULING, +} + +impl AsRef for RUNNING { + fn as_ref(&self) -> &str { + match self { + Self::DISPATCHING => "DISPATCHING", + Self::ANALYZING => "ANALYZING", + Self::OPTMIZING => "OPTMIZING", + Self::SCHEDULING => "SCHEDULING", + } + } +} + +#[derive(Debug, Clone)] +pub enum DONE { + FINISHED, + FAILED, + CANCELLED, +} + +impl AsRef for DONE { + fn as_ref(&self) -> &str { + match self { + Self::FINISHED => "FINISHED", + Self::FAILED => "FAILED", + Self::CANCELLED => "CANCELLED", + } + } +} diff --git a/s3select/api/src/query/function.rs b/s3select/api/src/query/function.rs new file mode 100644 index 00000000..af207fc1 --- /dev/null +++ b/s3select/api/src/query/function.rs @@ -0,0 +1,23 @@ +use std::collections::HashSet; +use std::sync::Arc; + +use datafusion::logical_expr::{AggregateUDF, ScalarUDF, WindowUDF}; + +use crate::QueryResult; + +pub type FuncMetaManagerRef = Arc; +pub trait FunctionMetadataManager { + fn register_udf(&mut self, udf: ScalarUDF) -> QueryResult<()>; + + fn register_udaf(&mut 
self, udaf: AggregateUDF) -> QueryResult<()>; + + fn register_udwf(&mut self, udwf: WindowUDF) -> QueryResult<()>; + + fn udf(&self, name: &str) -> QueryResult>; + + fn udaf(&self, name: &str) -> QueryResult>; + + fn udwf(&self, name: &str) -> QueryResult>; + + fn udfs(&self) -> HashSet; +} diff --git a/s3select/api/src/query/logical_planner.rs b/s3select/api/src/query/logical_planner.rs new file mode 100644 index 00000000..cef844b3 --- /dev/null +++ b/s3select/api/src/query/logical_planner.rs @@ -0,0 +1,40 @@ +use async_trait::async_trait; +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::logical_expr::LogicalPlan as DFPlan; + +use crate::QueryResult; + +use super::ast::ExtStatement; +use super::session::SessionCtx; + +#[derive(Clone)] +pub enum Plan { + // only support query sql + /// Query plan + Query(QueryPlan), +} + +impl Plan { + pub fn schema(&self) -> SchemaRef { + match self { + Self::Query(p) => SchemaRef::from(p.df_plan.schema().as_ref().to_owned()), + } + } +} + +#[derive(Debug, Clone)] +pub struct QueryPlan { + pub df_plan: DFPlan, + pub is_tag_scan: bool, +} + +impl QueryPlan { + pub fn is_explain(&self) -> bool { + matches!(self.df_plan, DFPlan::Explain(_) | DFPlan::Analyze(_)) + } +} + +#[async_trait] +pub trait LogicalPlanner { + async fn create_logical_plan(&self, statement: ExtStatement, session: &SessionCtx) -> QueryResult; +} diff --git a/s3select/api/src/query/mod.rs b/s3select/api/src/query/mod.rs new file mode 100644 index 00000000..6ddd2dc8 --- /dev/null +++ b/s3select/api/src/query/mod.rs @@ -0,0 +1,41 @@ +use s3s::dto::SelectObjectContentInput; + +pub mod analyzer; +pub mod ast; +pub mod datasource; +pub mod dispatcher; +pub mod execution; +pub mod function; +pub mod logical_planner; +pub mod optimizer; +pub mod parser; +pub mod physical_planner; +pub mod scheduler; +pub mod session; + +#[derive(Clone)] +pub struct Context { + // maybe we need transfer some info? 
+ pub input: SelectObjectContentInput, +} + +#[derive(Clone)] +pub struct Query { + context: Context, + content: String, +} + +impl Query { + #[inline(always)] + pub fn new(context: Context, content: String) -> Self { + Self { context, content } + } + + pub fn context(&self) -> &Context { + &self.context + } + + pub fn content(&self) -> &str { + self.content.as_str() + } +} diff --git a/s3select/api/src/query/optimizer.rs b/s3select/api/src/query/optimizer.rs new file mode 100644 index 00000000..c2392eb9 --- /dev/null +++ b/s3select/api/src/query/optimizer.rs @@ -0,0 +1,15 @@ +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::physical_plan::ExecutionPlan; + +use super::logical_planner::QueryPlan; +use super::session::SessionCtx; +use crate::QueryResult; + +pub type OptimizerRef = Arc; + +#[async_trait] +pub trait Optimizer { + async fn optimize(&self, plan: &QueryPlan, session: &SessionCtx) -> QueryResult>; +} diff --git a/s3select/api/src/query/parser.rs b/s3select/api/src/query/parser.rs new file mode 100644 index 00000000..76d7e723 --- /dev/null +++ b/s3select/api/src/query/parser.rs @@ -0,0 +1,8 @@ +use std::collections::VecDeque; + +use super::ast::ExtStatement; +use crate::QueryResult; + +pub trait Parser { + fn parse(&self, sql: &str) -> QueryResult>; +} diff --git a/s3select/api/src/query/physical_planner.rs b/s3select/api/src/query/physical_planner.rs new file mode 100644 index 00000000..c71787e9 --- /dev/null +++ b/s3select/api/src/query/physical_planner.rs @@ -0,0 +1,21 @@ +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::logical_expr::LogicalPlan; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_planner::ExtensionPlanner; + +use super::session::SessionCtx; +use crate::QueryResult; + +#[async_trait] +pub trait PhysicalPlanner { + /// Given a `LogicalPlan`, create an `ExecutionPlan` suitable for execution + async fn create_physical_plan( + &self, + logical_plan: &LogicalPlan, + session_state: &SessionCtx, + ) -> QueryResult>; + + fn inject_physical_transform_rule(&mut self, rule: Arc); +} diff --git a/s3select/api/src/query/scheduler.rs b/s3select/api/src/query/scheduler.rs new file mode 100644 index 00000000..3dd49c22 --- /dev/null +++ b/s3select/api/src/query/scheduler.rs @@ -0,0 +1,32 @@ +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::common::Result; +use datafusion::execution::context::TaskContext; +use datafusion::physical_plan::{ExecutionPlan, SendableRecordBatchStream}; + +pub type SchedulerRef = Arc; + +#[async_trait] +pub trait Scheduler { + /// Schedule the provided [`ExecutionPlan`] on this [`Scheduler`]. 
+ /// + /// Returns a [`ExecutionResults`] that can be used to receive results as they are produced, + /// as a [`futures::Stream`] of [`RecordBatch`] + async fn schedule(&self, plan: Arc, context: Arc) -> Result; +} + +pub struct ExecutionResults { + stream: SendableRecordBatchStream, +} + +impl ExecutionResults { + pub fn new(stream: SendableRecordBatchStream) -> Self { + Self { stream } + } + + /// Returns a [`SendableRecordBatchStream`] of this execution + pub fn stream(self) -> SendableRecordBatchStream { + self.stream + } +} diff --git a/s3select/api/src/query/session.rs b/s3select/api/src/query/session.rs new file mode 100644 index 00000000..c9d91f51 --- /dev/null +++ b/s3select/api/src/query/session.rs @@ -0,0 +1,86 @@ +use std::sync::Arc; + +use bytes::Bytes; +use datafusion::{ + execution::{context::SessionState, runtime_env::RuntimeEnvBuilder, SessionStateBuilder}, + prelude::SessionContext, +}; +use object_store::{memory::InMemory, path::Path, ObjectStore}; +use tracing::error; + +use crate::{object_store::EcObjectStore, QueryError, QueryResult}; + +use super::Context; + +#[derive(Clone)] +pub struct SessionCtx { + _desc: Arc, + inner: SessionState, +} + +impl SessionCtx { + pub fn inner(&self) -> &SessionState { + &self.inner + } +} + +#[derive(Clone)] +pub struct SessionCtxDesc { + // maybe we need some info +} + +#[derive(Default)] +pub struct SessionCtxFactory { + pub is_test: bool, +} + +impl SessionCtxFactory { + pub async fn create_session_ctx(&self, context: &Context) -> QueryResult { + let df_session_ctx = self.build_df_session_context(context).await?; + + Ok(SessionCtx { + _desc: Arc::new(SessionCtxDesc {}), + inner: df_session_ctx.state(), + }) + } + + async fn build_df_session_context(&self, context: &Context) -> QueryResult { + let path = format!("s3://{}", context.input.bucket); + let store_url = url::Url::parse(&path).unwrap(); + let rt = RuntimeEnvBuilder::new().build()?; + let df_session_state = SessionStateBuilder::new() + .with_runtime_env(Arc::new(rt)) + .with_default_features(); + + let df_session_state = if self.is_test { + let store: Arc = Arc::new(InMemory::new()); + let data = b"id,name,age,department,salary + 1,Alice,25,HR,5000 + 2,Bob,30,IT,6000 + 3,Charlie,35,Finance,7000 + 4,Diana,22,Marketing,4500 + 5,Eve,28,IT,5500 + 6,Frank,40,Finance,8000 + 7,Grace,26,HR,5200 + 8,Henry,32,IT,6200 + 9,Ivy,24,Marketing,4800 + 10,Jack,38,Finance,7500"; + let data_bytes = Bytes::from(data.to_vec()); + let path = Path::from(context.input.key.clone()); + store.put(&path, data_bytes.into()).await.map_err(|e| { + error!("put data into memory failed: {}", e.to_string()); + QueryError::StoreError { e: e.to_string() } + })?; + + df_session_state.with_object_store(&store_url, Arc::new(store)).build() + } else { + let store = + EcObjectStore::new(context.input.clone()).map_err(|_| QueryError::NotImplemented { err: String::new() })?; + df_session_state.with_object_store(&store_url, Arc::new(store)).build() + }; + + let df_session_ctx = SessionContext::new_with_state(df_session_state); + + Ok(df_session_ctx) + } +} diff --git a/s3select/api/src/server/dbms.rs b/s3select/api/src/server/dbms.rs new file mode 100644 index 00000000..85d32055 --- /dev/null +++ b/s3select/api/src/server/dbms.rs @@ -0,0 +1,41 @@ +use async_trait::async_trait; + +use crate::{ + query::{ + execution::{Output, QueryStateMachineRef}, + logical_planner::Plan, + Query, + }, + QueryResult, +}; + +pub struct QueryHandle { + query: Query, + result: Output, +} + +impl QueryHandle { + pub fn new(query: Query, 
result: Output) -> Self { + Self { query, result } + } + + pub fn query(&self) -> &Query { + &self.query + } + + pub fn result(self) -> Output { + self.result + } +} + +#[async_trait] +pub trait DatabaseManagerSystem { + async fn execute(&self, query: &Query) -> QueryResult; + async fn build_query_state_machine(&self, query: Query) -> QueryResult; + async fn build_logical_plan(&self, query_state_machine: QueryStateMachineRef) -> QueryResult>; + async fn execute_logical_plan( + &self, + logical_plan: Plan, + query_state_machine: QueryStateMachineRef, + ) -> QueryResult; +} diff --git a/s3select/api/src/server/mod.rs b/s3select/api/src/server/mod.rs new file mode 100644 index 00000000..c2e7c7b5 --- /dev/null +++ b/s3select/api/src/server/mod.rs @@ -0,0 +1 @@ +pub mod dbms; diff --git a/s3select/query/Cargo.toml b/s3select/query/Cargo.toml new file mode 100644 index 00000000..61b0b07b --- /dev/null +++ b/s3select/query/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "query" +version.workspace = true +edition.workspace = true + +[dependencies] +api = { path = "../api" } +async-recursion = { workspace = true } +async-trait.workspace = true +datafusion = { workspace = true } +derive_builder = { workspace = true } +futures = { workspace = true } +parking_lot = { version = "0.12.1" } +s3s.workspace = true +snafu = { workspace = true, features = ["backtrace"] } +tokio = { workspace = true } +tracing = { workspace = true } \ No newline at end of file diff --git a/s3select/query/src/data_source/mod.rs b/s3select/query/src/data_source/mod.rs new file mode 100644 index 00000000..b0704130 --- /dev/null +++ b/s3select/query/src/data_source/mod.rs @@ -0,0 +1 @@ +pub mod table_source; diff --git a/s3select/query/src/data_source/table_source.rs b/s3select/query/src/data_source/table_source.rs new file mode 100644 index 00000000..77df6e81 --- /dev/null +++ b/s3select/query/src/data_source/table_source.rs @@ -0,0 +1,138 @@ +use std::any::Any; +use std::borrow::Cow; +use std::fmt::Display; +use std::sync::Arc; +use std::write; + +use async_trait::async_trait; +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::common::Result as DFResult; +use datafusion::datasource::listing::ListingTable; +use datafusion::datasource::{provider_as_source, TableProvider}; +use datafusion::error::DataFusionError; +use datafusion::logical_expr::{LogicalPlan, LogicalPlanBuilder, TableProviderFilterPushDown, TableSource}; +use datafusion::prelude::Expr; +use datafusion::sql::TableReference; +use tracing::debug; + +pub const TEMP_LOCATION_TABLE_NAME: &str = "external_location_table"; + +pub struct TableSourceAdapter { + database_name: String, + table_name: String, + table_handle: TableHandle, + + plan: LogicalPlan, +} + +impl TableSourceAdapter { + pub fn try_new( + table_ref: impl Into, + table_name: impl Into, + table_handle: impl Into, + ) -> Result { + let table_name: String = table_name.into(); + + let table_handle = table_handle.into(); + let plan = match &table_handle { + // TableScan + TableHandle::External(t) => { + let table_source = provider_as_source(t.clone()); + LogicalPlanBuilder::scan(table_ref, table_source, None)?.build()? + } + // TableScan + TableHandle::TableProvider(t) => { + let table_source = provider_as_source(t.clone()); + if let Some(plan) = table_source.get_logical_plan() { + LogicalPlanBuilder::from(plan.into_owned()).build()? + } else { + LogicalPlanBuilder::scan(table_ref, table_source, None)?.build()? 
+ } + } + }; + + debug!("Table source logical plan node of {}:\n{}", table_name, plan.display_indent_schema()); + + Ok(Self { + database_name: "default_db".to_string(), + table_name, + table_handle, + plan, + }) + } + + pub fn database_name(&self) -> &str { + &self.database_name + } + + pub fn table_name(&self) -> &str { + &self.table_name + } + + pub fn table_handle(&self) -> &TableHandle { + &self.table_handle + } +} + +#[async_trait] +impl TableSource for TableSourceAdapter { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.table_handle.schema() + } + + fn supports_filters_pushdown(&self, filter: &[&Expr]) -> DFResult> { + self.table_handle.supports_filters_pushdown(filter) + } + + /// Called by [`InlineTableScan`] + fn get_logical_plan(&self) -> Option> { + Some(Cow::Owned(self.plan.clone())) + } +} + +#[derive(Clone)] +pub enum TableHandle { + TableProvider(Arc), + External(Arc), +} + +impl TableHandle { + pub fn schema(&self) -> SchemaRef { + match self { + Self::External(t) => t.schema(), + Self::TableProvider(t) => t.schema(), + } + } + + pub fn supports_filters_pushdown(&self, filter: &[&Expr]) -> DFResult> { + match self { + Self::External(t) => t.supports_filters_pushdown(filter), + Self::TableProvider(t) => t.supports_filters_pushdown(filter), + } + } +} + +impl From> for TableHandle { + fn from(value: Arc) -> Self { + TableHandle::TableProvider(value) + } +} + +impl From> for TableHandle { + fn from(value: Arc) -> Self { + TableHandle::External(value) + } +} + +impl Display for TableHandle { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::External(e) => write!(f, "External({:?})", e.table_paths()), + Self::TableProvider(_) => write!(f, "TableProvider"), + } + } +} diff --git a/s3select/query/src/dispatcher/manager.rs b/s3select/query/src/dispatcher/manager.rs new file mode 100644 index 00000000..4abc4cec --- /dev/null +++ b/s3select/query/src/dispatcher/manager.rs @@ -0,0 +1,271 @@ +use std::{ + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use api::{ + query::{ + ast::ExtStatement, + dispatcher::QueryDispatcher, + execution::{Output, QueryStateMachine}, + function::FuncMetaManagerRef, + logical_planner::{LogicalPlanner, Plan}, + parser::Parser, + session::{SessionCtx, SessionCtxFactory}, + Query, + }, + QueryError, QueryResult, +}; +use async_trait::async_trait; +use datafusion::{ + arrow::{datatypes::SchemaRef, record_batch::RecordBatch}, + config::CsvOptions, + datasource::{ + file_format::{csv::CsvFormat, json::JsonFormat, parquet::ParquetFormat}, + listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl}, + }, + error::Result as DFResult, + execution::{RecordBatchStream, SendableRecordBatchStream}, +}; +use futures::{Stream, StreamExt}; +use s3s::dto::SelectObjectContentInput; + +use crate::{ + execution::factory::QueryExecutionFactoryRef, + metadata::{base_table::BaseTableProvider, ContextProviderExtension, MetadataProvider, TableHandleProviderRef}, + sql::logical::planner::DefaultLogicalPlanner, +}; + +#[derive(Clone)] +pub struct SimpleQueryDispatcher { + input: SelectObjectContentInput, + // client for default tenant + _default_table_provider: TableHandleProviderRef, + session_factory: Arc, + // parser + parser: Arc, + // get query execution factory + query_execution_factory: QueryExecutionFactoryRef, + func_manager: FuncMetaManagerRef, +} + +#[async_trait] +impl QueryDispatcher for SimpleQueryDispatcher { + async fn execute_query(&self, query: &Query) -> 
QueryResult { + let query_state_machine = { self.build_query_state_machine(query.clone()).await? }; + + let logical_plan = self.build_logical_plan(query_state_machine.clone()).await?; + let logical_plan = match logical_plan { + Some(plan) => plan, + None => return Ok(Output::Nil(())), + }; + let result = self.execute_logical_plan(logical_plan, query_state_machine).await?; + Ok(result) + } + + async fn build_logical_plan(&self, query_state_machine: Arc) -> QueryResult> { + let session = &query_state_machine.session; + let query = &query_state_machine.query; + + let scheme_provider = self.build_scheme_provider(session).await?; + + let logical_planner = DefaultLogicalPlanner::new(&scheme_provider); + + let statements = self.parser.parse(query.content())?; + + // not allow multi statement + if statements.len() > 1 { + return Err(QueryError::MultiStatement { + num: statements.len(), + sql: query_state_machine.query.content().to_string(), + }); + } + + let stmt = match statements.front() { + Some(stmt) => stmt.clone(), + None => return Ok(None), + }; + + let logical_plan = self + .statement_to_logical_plan(stmt, &logical_planner, query_state_machine) + .await?; + Ok(Some(logical_plan)) + } + + async fn execute_logical_plan(&self, logical_plan: Plan, query_state_machine: Arc) -> QueryResult { + self.execute_logical_plan(logical_plan, query_state_machine).await + } + + async fn build_query_state_machine(&self, query: Query) -> QueryResult> { + let session = self.session_factory.create_session_ctx(query.context()).await?; + + let query_state_machine = Arc::new(QueryStateMachine::begin(query, session)); + Ok(query_state_machine) + } +} + +impl SimpleQueryDispatcher { + async fn statement_to_logical_plan( + &self, + stmt: ExtStatement, + logical_planner: &DefaultLogicalPlanner<'_, S>, + query_state_machine: Arc, + ) -> QueryResult { + // begin analyze + query_state_machine.begin_analyze(); + let logical_plan = logical_planner + .create_logical_plan(stmt, &query_state_machine.session) + .await?; + query_state_machine.end_analyze(); + + Ok(logical_plan) + } + + async fn execute_logical_plan(&self, logical_plan: Plan, query_state_machine: Arc) -> QueryResult { + let execution = self + .query_execution_factory + .create_query_execution(logical_plan, query_state_machine.clone()) + .await?; + + match execution.start().await { + Ok(Output::StreamData(stream)) => Ok(Output::StreamData(Box::pin(TrackedRecordBatchStream { inner: stream }))), + Ok(nil @ Output::Nil(_)) => Ok(nil), + Err(err) => Err(err), + } + } + + async fn build_scheme_provider(&self, session: &SessionCtx) -> QueryResult { + let path = format!("s3://{}/{}", self.input.bucket, self.input.key); + let table_path = ListingTableUrl::parse(path)?; + let listing_options = if self.input.request.input_serialization.csv.is_some() { + let file_format = CsvFormat::default().with_options(CsvOptions::default().with_has_header(false)); + ListingOptions::new(Arc::new(file_format)).with_file_extension(".csv") + } else if self.input.request.input_serialization.parquet.is_some() { + let file_format = ParquetFormat::new(); + ListingOptions::new(Arc::new(file_format)).with_file_extension(".parquet") + } else if self.input.request.input_serialization.json.is_some() { + let file_format = JsonFormat::default(); + ListingOptions::new(Arc::new(file_format)).with_file_extension(".json") + } else { + return Err(QueryError::NotImplemented { + err: "not support this file type".to_string(), + }); + }; + + let resolve_schema = listing_options.infer_schema(session.inner(), 
&table_path).await?; + let config = ListingTableConfig::new(table_path) + .with_listing_options(listing_options) + .with_schema(resolve_schema); + let provider = Arc::new(ListingTable::try_new(config)?); + let current_session_table_provider = self.build_table_handle_provider()?; + let metadata_provider = + MetadataProvider::new(provider, current_session_table_provider, self.func_manager.clone(), session.clone()); + + Ok(metadata_provider) + } + + fn build_table_handle_provider(&self) -> QueryResult { + let current_session_table_provider: Arc = Arc::new(BaseTableProvider::default()); + + Ok(current_session_table_provider) + } +} + +pub struct TrackedRecordBatchStream { + inner: SendableRecordBatchStream, +} + +impl RecordBatchStream for TrackedRecordBatchStream { + fn schema(&self) -> SchemaRef { + self.inner.schema() + } +} + +impl Stream for TrackedRecordBatchStream { + type Item = DFResult; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.inner.poll_next_unpin(cx) + } +} + +#[derive(Default, Clone)] +pub struct SimpleQueryDispatcherBuilder { + input: Option, + default_table_provider: Option, + session_factory: Option>, + parser: Option>, + + query_execution_factory: Option, + + func_manager: Option, +} + +impl SimpleQueryDispatcherBuilder { + pub fn with_input(mut self, input: SelectObjectContentInput) -> Self { + self.input = Some(input); + self + } + pub fn with_default_table_provider(mut self, default_table_provider: TableHandleProviderRef) -> Self { + self.default_table_provider = Some(default_table_provider); + self + } + + pub fn with_session_factory(mut self, session_factory: Arc) -> Self { + self.session_factory = Some(session_factory); + self + } + + pub fn with_parser(mut self, parser: Arc) -> Self { + self.parser = Some(parser); + self + } + + pub fn with_query_execution_factory(mut self, query_execution_factory: QueryExecutionFactoryRef) -> Self { + self.query_execution_factory = Some(query_execution_factory); + self + } + + pub fn with_func_manager(mut self, func_manager: FuncMetaManagerRef) -> Self { + self.func_manager = Some(func_manager); + self + } + + pub fn build(self) -> QueryResult> { + let input = self.input.ok_or_else(|| QueryError::BuildQueryDispatcher { + err: "lost of input".to_string(), + })?; + + let session_factory = self.session_factory.ok_or_else(|| QueryError::BuildQueryDispatcher { + err: "lost of session_factory".to_string(), + })?; + + let parser = self.parser.ok_or_else(|| QueryError::BuildQueryDispatcher { + err: "lost of parser".to_string(), + })?; + + let query_execution_factory = self.query_execution_factory.ok_or_else(|| QueryError::BuildQueryDispatcher { + err: "lost of query_execution_factory".to_string(), + })?; + + let func_manager = self.func_manager.ok_or_else(|| QueryError::BuildQueryDispatcher { + err: "lost of func_manager".to_string(), + })?; + + let default_table_provider = self.default_table_provider.ok_or_else(|| QueryError::BuildQueryDispatcher { + err: "lost of default_table_provider".to_string(), + })?; + + let dispatcher = Arc::new(SimpleQueryDispatcher { + input, + _default_table_provider: default_table_provider, + session_factory, + parser, + query_execution_factory, + func_manager, + }); + + Ok(dispatcher) + } +} diff --git a/s3select/query/src/dispatcher/mod.rs b/s3select/query/src/dispatcher/mod.rs new file mode 100644 index 00000000..ff8de9eb --- /dev/null +++ b/s3select/query/src/dispatcher/mod.rs @@ -0,0 +1 @@ +pub mod manager; diff --git a/s3select/query/src/execution/factory.rs 
b/s3select/query/src/execution/factory.rs new file mode 100644 index 00000000..9960d68a --- /dev/null +++ b/s3select/query/src/execution/factory.rs @@ -0,0 +1,46 @@ +use std::sync::Arc; + +use api::{ + query::{ + execution::{QueryExecutionFactory, QueryExecutionRef, QueryStateMachineRef}, + logical_planner::Plan, + optimizer::Optimizer, + scheduler::SchedulerRef, + }, + QueryError, +}; +use async_trait::async_trait; + +use super::query::SqlQueryExecution; + +pub type QueryExecutionFactoryRef = Arc; + +pub struct SqlQueryExecutionFactory { + optimizer: Arc, + scheduler: SchedulerRef, +} + +impl SqlQueryExecutionFactory { + #[inline(always)] + pub fn new(optimizer: Arc, scheduler: SchedulerRef) -> Self { + Self { optimizer, scheduler } + } +} + +#[async_trait] +impl QueryExecutionFactory for SqlQueryExecutionFactory { + async fn create_query_execution( + &self, + plan: Plan, + state_machine: QueryStateMachineRef, + ) -> Result { + match plan { + Plan::Query(query_plan) => Ok(Arc::new(SqlQueryExecution::new( + state_machine, + query_plan, + self.optimizer.clone(), + self.scheduler.clone(), + ))), + } + } +} diff --git a/s3select/query/src/execution/mod.rs b/s3select/query/src/execution/mod.rs new file mode 100644 index 00000000..807faf3e --- /dev/null +++ b/s3select/query/src/execution/mod.rs @@ -0,0 +1,3 @@ +pub mod factory; +pub mod query; +pub mod scheduler; diff --git a/s3select/query/src/execution/query.rs b/s3select/query/src/execution/query.rs new file mode 100644 index 00000000..15d6ef83 --- /dev/null +++ b/s3select/query/src/execution/query.rs @@ -0,0 +1,92 @@ +use std::sync::Arc; + +use api::query::execution::{Output, QueryExecution, QueryStateMachineRef}; +use api::query::logical_planner::QueryPlan; +use api::query::optimizer::Optimizer; +use api::query::scheduler::SchedulerRef; +use api::{QueryError, QueryResult}; +use async_trait::async_trait; +use futures::stream::AbortHandle; +use parking_lot::Mutex; +use tracing::debug; + +pub struct SqlQueryExecution { + query_state_machine: QueryStateMachineRef, + plan: QueryPlan, + optimizer: Arc, + scheduler: SchedulerRef, + + abort_handle: Mutex>, +} + +impl SqlQueryExecution { + pub fn new( + query_state_machine: QueryStateMachineRef, + plan: QueryPlan, + optimizer: Arc, + scheduler: SchedulerRef, + ) -> Self { + Self { + query_state_machine, + plan, + optimizer, + scheduler, + abort_handle: Mutex::new(None), + } + } + + async fn start(&self) -> QueryResult { + // begin optimize + self.query_state_machine.begin_optimize(); + let physical_plan = self.optimizer.optimize(&self.plan, &self.query_state_machine.session).await?; + self.query_state_machine.end_optimize(); + + // begin schedule + self.query_state_machine.begin_schedule(); + let stream = self + .scheduler + .schedule(physical_plan.clone(), self.query_state_machine.session.inner().task_ctx()) + .await? + .stream(); + + debug!("Success build result stream."); + self.query_state_machine.end_schedule(); + + Ok(Output::StreamData(stream)) + } +} + +#[async_trait] +impl QueryExecution for SqlQueryExecution { + async fn start(&self) -> QueryResult { + let (task, abort_handle) = futures::future::abortable(self.start()); + + { + *self.abort_handle.lock() = Some(abort_handle); + } + + task.await.map_err(|_| QueryError::Cancel)? 
+ } + + fn cancel(&self) -> QueryResult<()> { + debug!( + "cancel sql query execution: sql: {}, state: {:?}", + self.query_state_machine.query.content(), + self.query_state_machine.state() + ); + + // change state + self.query_state_machine.cancel(); + // stop future task + if let Some(e) = self.abort_handle.lock().as_ref() { + e.abort() + }; + + debug!( + "canceled sql query execution: sql: {}, state: {:?}", + self.query_state_machine.query.content(), + self.query_state_machine.state() + ); + Ok(()) + } +} diff --git a/s3select/query/src/execution/scheduler/local.rs b/s3select/query/src/execution/scheduler/local.rs new file mode 100644 index 00000000..e105d4b9 --- /dev/null +++ b/s3select/query/src/execution/scheduler/local.rs @@ -0,0 +1,22 @@ +use std::sync::Arc; + +use api::query::scheduler::{ExecutionResults, Scheduler}; +use async_trait::async_trait; +use datafusion::error::DataFusionError; +use datafusion::execution::context::TaskContext; +use datafusion::physical_plan::{execute_stream, ExecutionPlan}; + +pub struct LocalScheduler {} + +#[async_trait] +impl Scheduler for LocalScheduler { + async fn schedule( + &self, + plan: Arc, + context: Arc, + ) -> Result { + let stream = execute_stream(plan, context)?; + + Ok(ExecutionResults::new(stream)) + } +} diff --git a/s3select/query/src/execution/scheduler/mod.rs b/s3select/query/src/execution/scheduler/mod.rs new file mode 100644 index 00000000..27099624 --- /dev/null +++ b/s3select/query/src/execution/scheduler/mod.rs @@ -0,0 +1 @@ +pub mod local; diff --git a/s3select/query/src/function/mod.rs b/s3select/query/src/function/mod.rs new file mode 100644 index 00000000..e76614a0 --- /dev/null +++ b/s3select/query/src/function/mod.rs @@ -0,0 +1 @@ +pub mod simple_func_manager; diff --git a/s3select/query/src/function/simple_func_manager.rs b/s3select/query/src/function/simple_func_manager.rs new file mode 100644 index 00000000..129efacf --- /dev/null +++ b/s3select/query/src/function/simple_func_manager.rs @@ -0,0 +1,63 @@ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use api::query::function::FunctionMetadataManager; +use api::{QueryError, QueryResult}; +use datafusion::logical_expr::{AggregateUDF, ScalarUDF, WindowUDF}; + +pub type SimpleFunctionMetadataManagerRef = Arc; + +#[derive(Debug, Default)] +pub struct SimpleFunctionMetadataManager { + /// Scalar functions that are registered with the context + pub scalar_functions: HashMap>, + /// Aggregate functions registered in the context + pub aggregate_functions: HashMap>, + /// Window functions registered in the context + pub window_functions: HashMap>, +} + +impl FunctionMetadataManager for SimpleFunctionMetadataManager { + fn register_udf(&mut self, f: ScalarUDF) -> QueryResult<()> { + self.scalar_functions.insert(f.inner().name().to_uppercase(), Arc::new(f)); + Ok(()) + } + + fn register_udaf(&mut self, f: AggregateUDF) -> QueryResult<()> { + self.aggregate_functions.insert(f.inner().name().to_uppercase(), Arc::new(f)); + Ok(()) + } + + fn register_udwf(&mut self, f: WindowUDF) -> QueryResult<()> { + self.window_functions.insert(f.inner().name().to_uppercase(), Arc::new(f)); + Ok(()) + } + + fn udf(&self, name: &str) -> QueryResult> { + let result = self.scalar_functions.get(&name.to_uppercase()); + + result + .cloned() + .ok_or_else(|| QueryError::FunctionExists { name: name.to_string() }) + } + + fn udaf(&self, name: &str) -> QueryResult> { + let result = self.aggregate_functions.get(&name.to_uppercase()); + + result + .cloned() + .ok_or_else(|| 
QueryError::FunctionNotExists { name: name.to_string() }) + } + + fn udwf(&self, name: &str) -> QueryResult> { + let result = self.window_functions.get(&name.to_uppercase()); + + result + .cloned() + .ok_or_else(|| QueryError::FunctionNotExists { name: name.to_string() }) + } + + fn udfs(&self) -> HashSet { + self.scalar_functions.keys().cloned().collect() + } +} diff --git a/s3select/query/src/instance.rs b/s3select/query/src/instance.rs new file mode 100644 index 00000000..03cc7b03 --- /dev/null +++ b/s3select/query/src/instance.rs @@ -0,0 +1,164 @@ +use std::sync::Arc; + +use api::{ + query::{ + dispatcher::QueryDispatcher, execution::QueryStateMachineRef, logical_planner::Plan, session::SessionCtxFactory, Query, + }, + server::dbms::{DatabaseManagerSystem, QueryHandle}, + QueryResult, +}; +use async_trait::async_trait; +use derive_builder::Builder; +use s3s::dto::SelectObjectContentInput; + +use crate::{ + dispatcher::manager::SimpleQueryDispatcherBuilder, + execution::{factory::SqlQueryExecutionFactory, scheduler::local::LocalScheduler}, + function::simple_func_manager::SimpleFunctionMetadataManager, + metadata::base_table::BaseTableProvider, + sql::{optimizer::CascadeOptimizerBuilder, parser::DefaultParser}, +}; + +#[derive(Builder)] +pub struct RustFSms { + // query dispatcher & query execution + query_dispatcher: Arc, +} + +#[async_trait] +impl DatabaseManagerSystem for RustFSms +where + D: QueryDispatcher, +{ + async fn execute(&self, query: &Query) -> QueryResult { + let result = self.query_dispatcher.execute_query(query).await?; + + Ok(QueryHandle::new(query.clone(), result)) + } + + async fn build_query_state_machine(&self, query: Query) -> QueryResult { + let query_state_machine = self.query_dispatcher.build_query_state_machine(query).await?; + + Ok(query_state_machine) + } + + async fn build_logical_plan(&self, query_state_machine: QueryStateMachineRef) -> QueryResult> { + let logical_plan = self.query_dispatcher.build_logical_plan(query_state_machine).await?; + + Ok(logical_plan) + } + + async fn execute_logical_plan( + &self, + logical_plan: Plan, + query_state_machine: QueryStateMachineRef, + ) -> QueryResult { + let query = query_state_machine.query.clone(); + let result = self + .query_dispatcher + .execute_logical_plan(logical_plan, query_state_machine) + .await?; + + Ok(QueryHandle::new(query.clone(), result)) + } +} + +pub async fn make_rustfsms(input: SelectObjectContentInput, is_test: bool) -> QueryResult { + // init Function Manager, we can define some UDF if need + let func_manager = SimpleFunctionMetadataManager::default(); + // TODO session config need load global system config + let session_factory = Arc::new(SessionCtxFactory { is_test }); + let parser = Arc::new(DefaultParser::default()); + let optimizer = Arc::new(CascadeOptimizerBuilder::default().build()); + // TODO wrap, and num_threads configurable + let scheduler = Arc::new(LocalScheduler {}); + + let query_execution_factory = Arc::new(SqlQueryExecutionFactory::new(optimizer, scheduler)); + + let default_table_provider = Arc::new(BaseTableProvider::default()); + + let query_dispatcher = SimpleQueryDispatcherBuilder::default() + .with_input(input) + .with_func_manager(Arc::new(func_manager)) + .with_default_table_provider(default_table_provider) + .with_session_factory(session_factory) + .with_parser(parser) + .with_query_execution_factory(query_execution_factory) + .build()?; + + let mut builder = RustFSmsBuilder::default(); + + let db_server = 
builder.query_dispatcher(query_dispatcher).build().expect("build db server"); + + Ok(db_server) +} + +#[cfg(test)] +mod tests { + use api::{ + query::{Context, Query}, + server::dbms::DatabaseManagerSystem, + }; + use datafusion::{arrow::util::pretty, assert_batches_eq}; + use s3s::dto::{ + CSVInput, CSVOutput, ExpressionType, InputSerialization, OutputSerialization, SelectObjectContentInput, + SelectObjectContentRequest, + }; + + use crate::instance::make_rustfsms; + + #[tokio::test] + #[ignore] + async fn test_simple_sql() { + let sql = "select * from S3Object"; + let input = SelectObjectContentInput { + bucket: "dandan".to_string(), + expected_bucket_owner: None, + key: "test.csv".to_string(), + sse_customer_algorithm: None, + sse_customer_key: None, + sse_customer_key_md5: None, + request: SelectObjectContentRequest { + expression: sql.to_string(), + expression_type: ExpressionType::from_static("SQL"), + input_serialization: InputSerialization { + csv: Some(CSVInput::default()), + ..Default::default() + }, + output_serialization: OutputSerialization { + csv: Some(CSVOutput::default()), + ..Default::default() + }, + request_progress: None, + scan_range: None, + }, + }; + let db = make_rustfsms(input.clone(), true).await.unwrap(); + let query = Query::new(Context { input }, sql.to_string()); + + let result = db.execute(&query).await.unwrap(); + + let results = result.result().chunk_result().await.unwrap().to_vec(); + + let expected = [ + "+----------------+----------+----------+------------+----------+", + "| column_1 | column_2 | column_3 | column_4 | column_5 |", + "+----------------+----------+----------+------------+----------+", + "| id | name | age | department | salary |", + "| 1 | Alice | 25 | HR | 5000 |", + "| 2 | Bob | 30 | IT | 6000 |", + "| 3 | Charlie | 35 | Finance | 7000 |", + "| 4 | Diana | 22 | Marketing | 4500 |", + "| 5 | Eve | 28 | IT | 5500 |", + "| 6 | Frank | 40 | Finance | 8000 |", + "| 7 | Grace | 26 | HR | 5200 |", + "| 8 | Henry | 32 | IT | 6200 |", + "| 9 | Ivy | 24 | Marketing | 4800 |", + "| 10 | Jack | 38 | Finance | 7500 |", + "+----------------+----------+----------+------------+----------+", + ]; + + assert_batches_eq!(expected, &results); + pretty::print_batches(&results).unwrap(); + } +} diff --git a/s3select/query/src/lib.rs b/s3select/query/src/lib.rs new file mode 100644 index 00000000..0a0c12eb --- /dev/null +++ b/s3select/query/src/lib.rs @@ -0,0 +1,7 @@ +pub mod data_source; +pub mod dispatcher; +pub mod execution; +pub mod function; +pub mod instance; +pub mod metadata; +pub mod sql; diff --git a/s3select/query/src/metadata/base_table.rs b/s3select/query/src/metadata/base_table.rs new file mode 100644 index 00000000..38dc841a --- /dev/null +++ b/s3select/query/src/metadata/base_table.rs @@ -0,0 +1,17 @@ +use std::sync::Arc; + +use datafusion::common::Result as DFResult; +use datafusion::datasource::listing::ListingTable; + +use crate::data_source::table_source::TableHandle; + +use super::TableHandleProvider; + +#[derive(Default)] +pub struct BaseTableProvider {} + +impl TableHandleProvider for BaseTableProvider { + fn build_table_handle(&self, provider: Arc) -> DFResult { + Ok(TableHandle::External(provider)) + } +} diff --git a/s3select/query/src/metadata/mod.rs b/s3select/query/src/metadata/mod.rs new file mode 100644 index 00000000..78c79e36 --- /dev/null +++ b/s3select/query/src/metadata/mod.rs @@ -0,0 +1,126 @@ +use std::sync::Arc; + +use api::query::{function::FuncMetaManagerRef, session::SessionCtx}; +use async_trait::async_trait; +use 
datafusion::arrow::datatypes::DataType; +use datafusion::common::Result as DFResult; +use datafusion::datasource::listing::ListingTable; +use datafusion::logical_expr::var_provider::is_system_variables; +use datafusion::logical_expr::{AggregateUDF, ScalarUDF, TableSource, WindowUDF}; +use datafusion::variable::VarType; +use datafusion::{ + config::ConfigOptions, + sql::{planner::ContextProvider, TableReference}, +}; + +use crate::data_source::table_source::{TableHandle, TableSourceAdapter}; + +pub mod base_table; + +#[async_trait] +pub trait ContextProviderExtension: ContextProvider { + fn get_table_source_(&self, name: TableReference) -> datafusion::common::Result>; +} + +pub type TableHandleProviderRef = Arc; + +pub trait TableHandleProvider { + fn build_table_handle(&self, provider: Arc) -> DFResult; +} + +pub struct MetadataProvider { + provider: Arc, + session: SessionCtx, + config_options: ConfigOptions, + func_manager: FuncMetaManagerRef, + current_session_table_provider: TableHandleProviderRef, +} + +impl MetadataProvider { + #[allow(clippy::too_many_arguments)] + pub fn new( + provider: Arc, + current_session_table_provider: TableHandleProviderRef, + func_manager: FuncMetaManagerRef, + session: SessionCtx, + ) -> Self { + Self { + provider, + current_session_table_provider, + config_options: session.inner().config_options().clone(), + session, + func_manager, + } + } + + fn build_table_handle(&self) -> datafusion::common::Result { + self.current_session_table_provider.build_table_handle(self.provider.clone()) + } +} + +impl ContextProviderExtension for MetadataProvider { + fn get_table_source_(&self, table_ref: TableReference) -> datafusion::common::Result> { + let name = table_ref.clone().resolve("", ""); + let table_name = &*name.table; + + let table_handle = self.build_table_handle()?; + + Ok(Arc::new(TableSourceAdapter::try_new(table_ref.clone(), table_name, table_handle)?)) + } +} + +impl ContextProvider for MetadataProvider { + fn get_function_meta(&self, name: &str) -> Option> { + self.func_manager + .udf(name) + .ok() + .or(self.session.inner().scalar_functions().get(name).cloned()) + } + + fn get_aggregate_meta(&self, name: &str) -> Option> { + self.func_manager.udaf(name).ok() + } + + fn get_variable_type(&self, variable_names: &[String]) -> Option { + if variable_names.is_empty() { + return None; + } + + let var_type = if is_system_variables(variable_names) { + VarType::System + } else { + VarType::UserDefined + }; + + self.session + .inner() + .execution_props() + .get_var_provider(var_type) + .and_then(|p| p.get_type(variable_names)) + } + + fn options(&self) -> &ConfigOptions { + // TODO refactor + &self.config_options + } + + fn get_window_meta(&self, name: &str) -> Option> { + self.func_manager.udwf(name).ok() + } + + fn get_table_source(&self, name: TableReference) -> DFResult> { + Ok(self.get_table_source_(name)?) 
+ } + + fn udf_names(&self) -> Vec { + todo!() + } + + fn udaf_names(&self) -> Vec { + todo!() + } + + fn udwf_names(&self) -> Vec { + todo!() + } +} diff --git a/s3select/query/src/sql/analyzer.rs b/s3select/query/src/sql/analyzer.rs new file mode 100644 index 00000000..6507c842 --- /dev/null +++ b/s3select/query/src/sql/analyzer.rs @@ -0,0 +1,33 @@ +use api::query::analyzer::Analyzer; +use api::query::session::SessionCtx; +use api::QueryResult; +use datafusion::logical_expr::LogicalPlan; +use datafusion::optimizer::analyzer::Analyzer as DFAnalyzer; + +pub struct DefaultAnalyzer { + inner: DFAnalyzer, +} + +impl DefaultAnalyzer { + pub fn new() -> Self { + let analyzer = DFAnalyzer::default(); + // we can add analyzer rule at here + + Self { inner: analyzer } + } +} + +impl Default for DefaultAnalyzer { + fn default() -> Self { + Self::new() + } +} + +impl Analyzer for DefaultAnalyzer { + fn analyze(&self, plan: &LogicalPlan, session: &SessionCtx) -> QueryResult { + let plan = self + .inner + .execute_and_check(plan.to_owned(), session.inner().config_options(), |_, _| {})?; + Ok(plan) + } +} diff --git a/s3select/query/src/sql/dialect.rs b/s3select/query/src/sql/dialect.rs new file mode 100644 index 00000000..33297093 --- /dev/null +++ b/s3select/query/src/sql/dialect.rs @@ -0,0 +1,18 @@ +use datafusion::sql::sqlparser::dialect::Dialect; + +#[derive(Debug, Default)] +pub struct RustFsDialect; + +impl Dialect for RustFsDialect { + fn is_identifier_start(&self, ch: char) -> bool { + ch.is_alphabetic() || ch == '_' || ch == '#' || ch == '@' + } + + fn is_identifier_part(&self, ch: char) -> bool { + ch.is_alphabetic() || ch.is_ascii_digit() || ch == '@' || ch == '$' || ch == '#' || ch == '_' + } + + fn supports_group_by_expr(&self) -> bool { + true + } +} diff --git a/s3select/query/src/sql/logical/mod.rs b/s3select/query/src/sql/logical/mod.rs new file mode 100644 index 00000000..1ecfae43 --- /dev/null +++ b/s3select/query/src/sql/logical/mod.rs @@ -0,0 +1,2 @@ +pub mod optimizer; +pub mod planner; diff --git a/s3select/query/src/sql/logical/optimizer.rs b/s3select/query/src/sql/logical/optimizer.rs new file mode 100644 index 00000000..e97e2967 --- /dev/null +++ b/s3select/query/src/sql/logical/optimizer.rs @@ -0,0 +1,111 @@ +use std::sync::Arc; + +use api::{ + query::{analyzer::AnalyzerRef, logical_planner::QueryPlan, session::SessionCtx}, + QueryResult, +}; +use datafusion::{ + execution::SessionStateBuilder, + logical_expr::LogicalPlan, + optimizer::{ + common_subexpr_eliminate::CommonSubexprEliminate, decorrelate_predicate_subquery::DecorrelatePredicateSubquery, + eliminate_cross_join::EliminateCrossJoin, eliminate_duplicated_expr::EliminateDuplicatedExpr, + eliminate_filter::EliminateFilter, eliminate_join::EliminateJoin, eliminate_limit::EliminateLimit, + eliminate_outer_join::EliminateOuterJoin, extract_equijoin_predicate::ExtractEquijoinPredicate, + filter_null_join_keys::FilterNullJoinKeys, propagate_empty_relation::PropagateEmptyRelation, + push_down_filter::PushDownFilter, push_down_limit::PushDownLimit, + replace_distinct_aggregate::ReplaceDistinctWithAggregate, scalar_subquery_to_join::ScalarSubqueryToJoin, + simplify_expressions::SimplifyExpressions, single_distinct_to_groupby::SingleDistinctToGroupBy, + unwrap_cast_in_comparison::UnwrapCastInComparison, OptimizerRule, + }, +}; +use tracing::debug; + +use crate::sql::analyzer::DefaultAnalyzer; + +pub trait LogicalOptimizer: Send + Sync { + fn optimize(&self, plan: &QueryPlan, session: &SessionCtx) -> QueryResult; + + fn 
inject_optimizer_rule(&mut self, optimizer_rule: Arc<dyn OptimizerRule + Send + Sync>);
+}
+
+pub struct DefaultLogicalOptimizer {
+    // fit datafusion
+    // TODO refactor
+    analyzer: AnalyzerRef,
+    rules: Vec<Arc<dyn OptimizerRule + Send + Sync>>,
+}
+
+impl DefaultLogicalOptimizer {
+    #[allow(dead_code)]
+    fn with_optimizer_rules(mut self, rules: Vec<Arc<dyn OptimizerRule + Send + Sync>>) -> Self {
+        self.rules = rules;
+        self
+    }
+}
+
+impl Default for DefaultLogicalOptimizer {
+    fn default() -> Self {
+        let analyzer = Arc::new(DefaultAnalyzer::default());
+
+        // additional optimizer rules
+        let rules: Vec<Arc<dyn OptimizerRule + Send + Sync>> = vec![
+            // df default rules start
+            Arc::new(SimplifyExpressions::new()),
+            Arc::new(UnwrapCastInComparison::new()),
+            Arc::new(ReplaceDistinctWithAggregate::new()),
+            Arc::new(EliminateJoin::new()),
+            Arc::new(DecorrelatePredicateSubquery::new()),
+            Arc::new(ScalarSubqueryToJoin::new()),
+            Arc::new(ExtractEquijoinPredicate::new()),
+            // simplify expressions does not simplify expressions in subqueries, so we
+            // run it again after running the optimizations that potentially converted
+            // subqueries to joins
+            Arc::new(SimplifyExpressions::new()),
+            Arc::new(EliminateDuplicatedExpr::new()),
+            Arc::new(EliminateFilter::new()),
+            Arc::new(EliminateCrossJoin::new()),
+            Arc::new(CommonSubexprEliminate::new()),
+            Arc::new(EliminateLimit::new()),
+            Arc::new(PropagateEmptyRelation::new()),
+            Arc::new(FilterNullJoinKeys::default()),
+            Arc::new(EliminateOuterJoin::new()),
+            // Filters can't be pushed down past Limits, so we should run PushDownFilter after PushDownLimit
+            Arc::new(PushDownLimit::new()),
+            Arc::new(PushDownFilter::new()),
+            Arc::new(SingleDistinctToGroupBy::new()),
+            // The previous optimizations added expressions and projections
+            // that might benefit from the following rules
+            Arc::new(SimplifyExpressions::new()),
+            Arc::new(UnwrapCastInComparison::new()),
+            Arc::new(CommonSubexprEliminate::new()),
+            // PushDownProjection can push Projections down through Limits, so run PushDownLimit again.
+            Arc::new(PushDownLimit::new()),
+            // df default rules end
+            // custom rules can be added here
+        ];
+
+        Self { analyzer, rules }
+    }
+}
+
+impl LogicalOptimizer for DefaultLogicalOptimizer {
+    fn optimize(&self, plan: &QueryPlan, session: &SessionCtx) -> QueryResult<LogicalPlan> {
+        let analyzed_plan = { self.analyzer.analyze(&plan.df_plan, session)? };
+
+        debug!("Analyzed logical plan:\n{}\n", analyzed_plan.display_indent_schema());
+
+        let optimized_plan = {
+            SessionStateBuilder::new_from_existing(session.inner().clone())
+                .with_optimizer_rules(self.rules.clone())
+                .build()
+                .optimize(&analyzed_plan)?
+        };
+
+        Ok(optimized_plan)
+    }
+
+    fn inject_optimizer_rule(&mut self, optimizer_rule: Arc<dyn OptimizerRule + Send + Sync>) {
+        self.rules.push(optimizer_rule);
+    }
+}
diff --git a/s3select/query/src/sql/logical/planner.rs b/s3select/query/src/sql/logical/planner.rs
new file mode 100644
index 00000000..bcdc59c0
--- /dev/null
+++ b/s3select/query/src/sql/logical/planner.rs
@@ -0,0 +1,3 @@
+use crate::sql::planner::SqlPlanner;
+
+pub type DefaultLogicalPlanner<'a, S> = SqlPlanner<'a, S>;
diff --git a/s3select/query/src/sql/mod.rs b/s3select/query/src/sql/mod.rs
new file mode 100644
index 00000000..151fc83d
--- /dev/null
+++ b/s3select/query/src/sql/mod.rs
@@ -0,0 +1,7 @@
+pub mod analyzer;
+pub mod dialect;
+pub mod logical;
+pub mod optimizer;
+pub mod parser;
+pub mod physical;
+pub mod planner;
diff --git a/s3select/query/src/sql/optimizer.rs b/s3select/query/src/sql/optimizer.rs
new file mode 100644
index 00000000..b424b073
--- /dev/null
+++ b/s3select/query/src/sql/optimizer.rs
@@ -0,0 +1,82 @@
+use std::sync::Arc;
+
+use api::{
+    query::{logical_planner::QueryPlan, optimizer::Optimizer, physical_planner::PhysicalPlanner, session::SessionCtx},
+    QueryResult,
+};
+use async_trait::async_trait;
+use datafusion::physical_plan::{displayable, ExecutionPlan};
+use tracing::debug;
+
+use super::{
+    logical::optimizer::{DefaultLogicalOptimizer, LogicalOptimizer},
+    physical::{optimizer::PhysicalOptimizer, planner::DefaultPhysicalPlanner},
+};
+
+pub struct CascadeOptimizer {
+    logical_optimizer: Arc<dyn LogicalOptimizer>,
+    physical_planner: Arc<dyn PhysicalPlanner + Send + Sync>,
+    physical_optimizer: Arc<dyn PhysicalOptimizer + Send + Sync>,
+}
+
+#[async_trait]
+impl Optimizer for CascadeOptimizer {
+    async fn optimize(&self, plan: &QueryPlan, session: &SessionCtx) -> QueryResult<Arc<dyn ExecutionPlan>> {
+        debug!("Original logical plan:\n{}\n", plan.df_plan.display_indent_schema());
+
+        let optimized_logical_plan = self.logical_optimizer.optimize(plan, session)?;
+
+        debug!("Final logical plan:\n{}\n", optimized_logical_plan.display_indent_schema());
+
+        let physical_plan = {
+            self.physical_planner
+                .create_physical_plan(&optimized_logical_plan, session)
+                .await?
+        };
+
+        debug!("Original physical plan:\n{}\n", displayable(physical_plan.as_ref()).indent(false));
+
+        let optimized_physical_plan = { self.physical_optimizer.optimize(physical_plan, session)?
}; + + Ok(optimized_physical_plan) + } +} + +#[derive(Default)] +pub struct CascadeOptimizerBuilder { + logical_optimizer: Option>, + physical_planner: Option>, + physical_optimizer: Option>, +} + +impl CascadeOptimizerBuilder { + pub fn with_logical_optimizer(mut self, logical_optimizer: Arc) -> Self { + self.logical_optimizer = Some(logical_optimizer); + self + } + + pub fn with_physical_planner(mut self, physical_planner: Arc) -> Self { + self.physical_planner = Some(physical_planner); + self + } + + pub fn with_physical_optimizer(mut self, physical_optimizer: Arc) -> Self { + self.physical_optimizer = Some(physical_optimizer); + self + } + + pub fn build(self) -> CascadeOptimizer { + let default_logical_optimizer = Arc::new(DefaultLogicalOptimizer::default()); + let default_physical_planner = Arc::new(DefaultPhysicalPlanner::default()); + + let logical_optimizer = self.logical_optimizer.unwrap_or(default_logical_optimizer); + let physical_planner = self.physical_planner.unwrap_or_else(|| default_physical_planner.clone()); + let physical_optimizer = self.physical_optimizer.unwrap_or(default_physical_planner); + + CascadeOptimizer { + logical_optimizer, + physical_planner, + physical_optimizer, + } + } +} diff --git a/s3select/query/src/sql/parser.rs b/s3select/query/src/sql/parser.rs new file mode 100644 index 00000000..ebd2b5d4 --- /dev/null +++ b/s3select/query/src/sql/parser.rs @@ -0,0 +1,92 @@ +use std::{collections::VecDeque, fmt::Display}; + +use api::{ + query::{ast::ExtStatement, parser::Parser as RustFsParser}, + ParserSnafu, +}; +use datafusion::sql::sqlparser::{ + dialect::Dialect, + parser::{Parser, ParserError}, + tokenizer::{Token, Tokenizer}, +}; +use snafu::ResultExt; + +use super::dialect::RustFsDialect; + +pub type Result = std::result::Result; + +// Use `Parser::expected` instead, if possible +macro_rules! 
parser_err { + ($MSG:expr) => { + Err(ParserError::ParserError($MSG.to_string())) + }; +} + +#[derive(Default)] +pub struct DefaultParser {} + +impl RustFsParser for DefaultParser { + fn parse(&self, sql: &str) -> api::QueryResult> { + ExtParser::parse_sql(sql).context(ParserSnafu) + } +} + +/// SQL Parser +pub struct ExtParser<'a> { + parser: Parser<'a>, +} + +impl<'a> ExtParser<'a> { + /// Parse the specified tokens with dialect + fn new_with_dialect(sql: &str, dialect: &'a dyn Dialect) -> Result { + let mut tokenizer = Tokenizer::new(dialect, sql); + let tokens = tokenizer.tokenize()?; + Ok(ExtParser { + parser: Parser::new(dialect).with_tokens(tokens), + }) + } + + /// Parse a SQL statement and produce a set of statements + pub fn parse_sql(sql: &str) -> Result> { + let dialect = &RustFsDialect {}; + ExtParser::parse_sql_with_dialect(sql, dialect) + } + + /// Parse a SQL statement and produce a set of statements + pub fn parse_sql_with_dialect(sql: &str, dialect: &dyn Dialect) -> Result> { + let mut parser = ExtParser::new_with_dialect(sql, dialect)?; + let mut stmts = VecDeque::new(); + let mut expecting_statement_delimiter = false; + loop { + // ignore empty statements (between successive statement delimiters) + while parser.parser.consume_token(&Token::SemiColon) { + expecting_statement_delimiter = false; + } + + if parser.parser.peek_token() == Token::EOF { + break; + } + if expecting_statement_delimiter { + return parser.expected("end of statement", parser.parser.peek_token()); + } + + let statement = parser.parse_statement()?; + stmts.push_back(statement); + expecting_statement_delimiter = true; + } + + // debug!("Parser sql: {}, stmts: {:#?}", sql, stmts); + + Ok(stmts) + } + + /// Parse a new expression + fn parse_statement(&mut self) -> Result { + Ok(ExtStatement::SqlStatement(Box::new(self.parser.parse_statement()?))) + } + + // Report unexpected token + fn expected(&self, expected: &str, found: impl Display) -> Result { + parser_err!(format!("Expected {}, found: {}", expected, found)) + } +} diff --git a/s3select/query/src/sql/physical/mod.rs b/s3select/query/src/sql/physical/mod.rs new file mode 100644 index 00000000..1ecfae43 --- /dev/null +++ b/s3select/query/src/sql/physical/mod.rs @@ -0,0 +1,2 @@ +pub mod optimizer; +pub mod planner; diff --git a/s3select/query/src/sql/physical/optimizer.rs b/s3select/query/src/sql/physical/optimizer.rs new file mode 100644 index 00000000..12f16e3d --- /dev/null +++ b/s3select/query/src/sql/physical/optimizer.rs @@ -0,0 +1,12 @@ +use std::sync::Arc; + +use api::query::session::SessionCtx; +use api::QueryResult; +use datafusion::physical_optimizer::PhysicalOptimizerRule; +use datafusion::physical_plan::ExecutionPlan; + +pub trait PhysicalOptimizer { + fn optimize(&self, plan: Arc, session: &SessionCtx) -> QueryResult>; + + fn inject_optimizer_rule(&mut self, optimizer_rule: Arc); +} diff --git a/s3select/query/src/sql/physical/planner.rs b/s3select/query/src/sql/physical/planner.rs new file mode 100644 index 00000000..254c198d --- /dev/null +++ b/s3select/query/src/sql/physical/planner.rs @@ -0,0 +1,104 @@ +use std::sync::Arc; + +use api::query::physical_planner::PhysicalPlanner; +use api::query::session::SessionCtx; +use api::QueryResult; +use async_trait::async_trait; +use datafusion::execution::SessionStateBuilder; +use datafusion::logical_expr::LogicalPlan; +use datafusion::physical_optimizer::aggregate_statistics::AggregateStatistics; +use datafusion::physical_optimizer::coalesce_batches::CoalesceBatches; +use 
datafusion::physical_optimizer::join_selection::JoinSelection;
+use datafusion::physical_optimizer::PhysicalOptimizerRule;
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::physical_planner::{
+    DefaultPhysicalPlanner as DFDefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner as DFPhysicalPlanner,
+};
+
+use super::optimizer::PhysicalOptimizer;
+
+pub struct DefaultPhysicalPlanner {
+    ext_physical_transform_rules: Vec<Arc<dyn ExtensionPlanner + Send + Sync>>,
+    /// Responsible for optimizing a physical execution plan
+    ext_physical_optimizer_rules: Vec<Arc<dyn PhysicalOptimizerRule + Send + Sync>>,
+}
+
+impl DefaultPhysicalPlanner {
+    #[allow(dead_code)]
+    fn with_physical_transform_rules(mut self, rules: Vec<Arc<dyn ExtensionPlanner + Send + Sync>>) -> Self {
+        self.ext_physical_transform_rules = rules;
+        self
+    }
+}
+
+impl DefaultPhysicalPlanner {
+    #[allow(dead_code)]
+    fn with_optimizer_rules(mut self, rules: Vec<Arc<dyn PhysicalOptimizerRule + Send + Sync>>) -> Self {
+        self.ext_physical_optimizer_rules = rules;
+        self
+    }
+}
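Reviewer note: `ext_physical_optimizer_rules` is the extension point for custom physical passes. A sketch of a rule that could be handed to `with_optimizer_rules` or `inject_optimizer_rule` (hypothetical; assumes the shape of DataFusion's `PhysicalOptimizerRule` trait in the version pinned by this workspace):

use std::sync::Arc;

use datafusion::config::ConfigOptions;
use datafusion::error::Result;
use datafusion::physical_optimizer::PhysicalOptimizerRule;
use datafusion::physical_plan::ExecutionPlan;

#[derive(Debug, Default)]
struct NoopPass;

impl PhysicalOptimizerRule for NoopPass {
    fn optimize(&self, plan: Arc<dyn ExecutionPlan>, _config: &ConfigOptions) -> Result<Arc<dyn ExecutionPlan>> {
        // A real pass would rewrite the plan tree here; this one passes it through unchanged.
        Ok(plan)
    }

    fn name(&self) -> &str {
        "noop_pass"
    }

    fn schema_check(&self) -> bool {
        // This pass never changes the schema, so the post-rule schema check can stay on.
        true
    }
}

Because the rule vector is cloned into a fresh `SessionState` per plan (see `create_physical_plan` below), rules run in exactly the order they appear in this vector.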
+impl Default for DefaultPhysicalPlanner {
+    fn default() -> Self {
+        let ext_physical_transform_rules: Vec<Arc<dyn ExtensionPlanner + Send + Sync>> = vec![
+            // custom extension planners can be added here
+        ];
+
+        // We need to take care of the rule ordering; the rules may influence each other.
+        let ext_physical_optimizer_rules: Vec<Arc<dyn PhysicalOptimizerRule + Send + Sync>> = vec![
+            Arc::new(AggregateStatistics::new()),
+            // Statistics-based join selection will change the Auto mode to a real join implementation,
+            // like collect left, or hash join, or future sort merge join, which will influence the
+            // EnforceDistribution and EnforceSorting rules as they decide whether to add additional
+            // repartitioning and local sorting steps to meet distribution and ordering requirements.
+            // Therefore, it should run before EnforceDistribution and EnforceSorting.
+            Arc::new(JoinSelection::new()),
+            // The CoalesceBatches rule will not influence the distribution and ordering of the
+            // whole plan tree. Therefore, to avoid influencing other rules, it should run last.
+            Arc::new(CoalesceBatches::new()),
+        ];
+
+        Self {
+            ext_physical_transform_rules,
+            ext_physical_optimizer_rules,
+        }
+    }
+}
+
+#[async_trait]
+impl PhysicalPlanner for DefaultPhysicalPlanner {
+    async fn create_physical_plan(
+        &self,
+        logical_plan: &LogicalPlan,
+        session: &SessionCtx,
+    ) -> QueryResult<Arc<dyn ExecutionPlan>> {
+        // Inject the extended physical optimizer rules into DataFusion's session state
+        let new_state = SessionStateBuilder::new_from_existing(session.inner().clone())
+            .with_physical_optimizer_rules(self.ext_physical_optimizer_rules.clone())
+            .build();
+
+        // Build DataFusion's physical planner from the extended physical transform rules
+        let planner = DFDefaultPhysicalPlanner::with_extension_planners(self.ext_physical_transform_rules.clone());
+
+        // Run DataFusion's physical planning and optimization
+        planner
+            .create_physical_plan(logical_plan, &new_state)
+            .await
+            .map_err(|e| e.into())
+    }
+
+    fn inject_physical_transform_rule(&mut self, rule: Arc<dyn ExtensionPlanner + Send + Sync>) {
+        self.ext_physical_transform_rules.push(rule)
+    }
+}
+
+impl PhysicalOptimizer for DefaultPhysicalPlanner {
+    fn optimize(&self, plan: Arc<dyn ExecutionPlan>, _session: &SessionCtx) -> QueryResult<Arc<dyn ExecutionPlan>> {
+        Ok(plan)
+    }
+
+    fn inject_optimizer_rule(&mut self, optimizer_rule: Arc<dyn PhysicalOptimizerRule + Send + Sync>) {
+        self.ext_physical_optimizer_rules.push(optimizer_rule);
+    }
+}
diff --git a/s3select/query/src/sql/planner.rs b/s3select/query/src/sql/planner.rs
new file mode 100644
index 00000000..a6c9f8c1
--- /dev/null
+++ b/s3select/query/src/sql/planner.rs
@@ -0,0 +1,60 @@
+use api::{
+    query::{
+        ast::ExtStatement,
+        logical_planner::{LogicalPlanner, Plan, QueryPlan},
+        session::SessionCtx,
+    },
+    QueryError, QueryResult,
+};
+use async_recursion::async_recursion;
+use async_trait::async_trait;
+use datafusion::sql::{planner::SqlToRel, sqlparser::ast::Statement};
+
+use crate::metadata::ContextProviderExtension;
+
+pub struct SqlPlanner<'a, S: ContextProviderExtension> {
+    _schema_provider: &'a S,
+    df_planner: SqlToRel<'a, S>,
+}
+
+#[async_trait]
+impl<S: ContextProviderExtension + Send + Sync> LogicalPlanner for SqlPlanner<'_, S> {
+    async fn create_logical_plan(&self, statement: ExtStatement, session: &SessionCtx) -> QueryResult<Plan> {
+        let plan = { self.statement_to_plan(statement, session).await? };
+
+        Ok(plan)
+    }
+}
+
+impl<'a, S: ContextProviderExtension + Send + Sync + 'a> SqlPlanner<'a, S> {
+    /// Create a new query planner
+    pub fn new(schema_provider: &'a S) -> Self {
+        SqlPlanner {
+            _schema_provider: schema_provider,
+            df_planner: SqlToRel::new(schema_provider),
+        }
+    }
+
+    /// Generate a logical plan from an extended SQL statement
+    #[async_recursion]
+    pub(crate) async fn statement_to_plan(&self, statement: ExtStatement, session: &SessionCtx) -> QueryResult<Plan> {
+        match statement {
+            ExtStatement::SqlStatement(stmt) => self.df_sql_to_plan(*stmt, session).await,
+        }
+    }
+
+    async fn df_sql_to_plan(&self, stmt: Statement, _session: &SessionCtx) -> QueryResult<Plan> {
+        match stmt {
+            Statement::Query(_) => {
+                let df_plan = self.df_planner.sql_statement_to_plan(stmt)?;
+                let plan = Plan::Query(QueryPlan {
+                    df_plan,
+                    is_tag_scan: false,
+                });
+
+                Ok(plan)
+            }
+            _ => Err(QueryError::NotImplemented { err: stmt.to_string() }),
+        }
+    }
+}
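Reviewer note: taken together, this is the path a `SELECT ... FROM S3Object` travels: `DefaultParser` -> `SqlPlanner` -> `CascadeOptimizer` (logical rules, physical planning, physical rules) -> `LocalScheduler`, with `QueryStateMachine` recording the ACCEPTING, RUNNING, and DONE transitions along the way. A condensed sketch of the driver loop, using only types introduced in this diff (error handling and state inspection elided; `make_rustfsms` wires up the same path for real):

use api::query::execution::Output;
use api::query::Query;
use api::server::dbms::DatabaseManagerSystem;
use api::QueryResult;

async fn run(db: &impl DatabaseManagerSystem, query: Query) -> QueryResult<Output> {
    // ACCEPTING: create the session and state machine for this query
    let qsm = db.build_query_state_machine(query).await?;

    // RUNNING (analyze): parse and plan; `None` means the statement was empty
    let plan = match db.build_logical_plan(qsm.clone()).await? {
        Some(plan) => plan,
        None => return Ok(Output::Nil(())),
    };

    // RUNNING (optimize, schedule), then DONE: optimize, schedule, and stream results
    let handle = db.execute_logical_plan(plan, qsm).await?;
    Ok(handle.result())
}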