diff --git a/Cargo.lock b/Cargo.lock index c7c50f5047e..adf9aeb4a51 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -196,25 +196,57 @@ version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f02882884d3e1bc524fb12c79f107f6ad0e1cfd498c536ffb494301740995dfe" +[[package]] +name = "arrow" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb98341a7e051bb79731ecb33ec00cbd6e0e315a542d6732b46d462c9215ea2" +dependencies = [ + "arrow-arith 56.2.1", + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-cast 56.2.1", + "arrow-data 56.2.1", + "arrow-ord 56.2.1", + "arrow-row 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "arrow-string 56.2.1", +] + [[package]] name = "arrow" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "378530e55cd479eda3c14eb345310799717e6f76d0c332041e8487022166b471" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", "arrow-csv", - "arrow-data", - "arrow-ipc", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", "arrow-json", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", - "arrow-string", + "arrow-ord 58.3.0", + "arrow-row 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", + "arrow-string 58.3.0", +] + +[[package]] +name = "arrow-arith" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce4751cbc4bcccfeeea79df9571ff1dc066d61e44723c7604d11c7937f5b560" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "chrono", + "num", ] [[package]] @@ -223,14 +255,30 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0ab212d2c1886e802f51c5212d78ebbcbb0bec980fff9dadc1eb8d45cd0b738" dependencies = [ - "arrow-array", - 
"arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "chrono", "num-traits", ] +[[package]] +name = "arrow-array" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b02ccba2e977a3aabb4384036109ca32f552399a2bc0588f925f91ed073ce70c" +dependencies = [ + "ahash 0.8.12", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "chrono", + "half", + "hashbrown 0.16.1", + "num", +] + [[package]] name = "arrow-array" version = "58.3.0" @@ -238,9 +286,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfd33d3e92f207444098c75b42de99d329562be0cf686b307b097cc52b4e999e" dependencies = [ "ahash 0.8.12", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "chrono", "chrono-tz", "half", @@ -256,9 +304,9 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "049230728cd6e093088c8d231b4beede184e35cad7777c1505c0d5a8571f4376" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "bytes", "bzip2", "crc", @@ -274,6 +322,17 @@ dependencies = [ "zstd", ] +[[package]] +name = "arrow-buffer" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a90f8bece6a9ee316a699fbbfde368a206676a1206ce89b50f07937648e76c3c" +dependencies = [ + "bytes", + "half", + "num", +] + [[package]] name = "arrow-buffer" version = "58.3.0" @@ -286,18 +345,39 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-cast" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61ffe645cfb4e80b1ca37a3a106ce7b4af66ccdd60c655a57e6b9aab096164a7" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + 
"arrow-schema 56.2.1", + "arrow-select 56.2.1", + "atoi", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num", + "ryu", +] + [[package]] name = "arrow-cast" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c5aefb56a2c02e9e2b30746241058b85f8983f0fcff2ba0c6d09006e1cded7f" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "atoi", "base64", "chrono", @@ -314,41 +394,67 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e94e8cf7e517657a52b91ea1263acf38c4ca62a84655d72458a3359b12ab97de" dependencies = [ - "arrow-array", - "arrow-cast", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-cast 58.3.0", + "arrow-schema 58.3.0", "chrono", "csv", "csv-core", "regex", ] +[[package]] +name = "arrow-data" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78468c813909465dd0f858950c8a0614eb63608134acf95c602ec21381258b28" +dependencies = [ + "arrow-buffer 56.2.1", + "arrow-schema 56.2.1", + "half", + "num", +] + [[package]] name = "arrow-data" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c88210023a2bfee1896af366309a3028fc3bcbd6515fa29a7990ee1baa08ee0" dependencies = [ - "arrow-buffer", - "arrow-schema", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "half", "num-integer", "num-traits", ] +[[package]] +name = "arrow-ipc" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31f88b0fbb33af28089ccd3e4dcd0ff09de46842168d00220b920f7231feddf5" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "flatbuffers", +] + [[package]] 
name = "arrow-ipc" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "238438f0834483703d88896db6fe5a7138b2230debc31b34c0336c2996e3c64f" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "flatbuffers", - "lz4_flex", + "lz4_flex 0.13.1", "zstd", ] @@ -358,12 +464,12 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "205ca2119e6d679d5c133c6f30e68f027738d95ed948cf77677ea69c7800036b" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "chrono", "half", "indexmap 2.14.0", @@ -377,17 +483,43 @@ dependencies = [ "simdutf8", ] +[[package]] +name = "arrow-ord" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aed58a38c3db0a2cf75ef70e3cb6bc4bd0da0a3d390de37c36139b31fae826e8" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", +] + [[package]] name = "arrow-ord" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bffd8fd2579286a5d63bac898159873e5094a79009940bcb42bbfce4f19f1d0" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", +] + +[[package]] +name = "arrow-row" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "079ced0517daf4f09b070d09ff641cee7cc331aa216bebcb25d1a6474ad53086" +dependencies = [ + 
"arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "half", ] [[package]] @@ -396,13 +528,19 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bab5994731204603c73ba69267616c50f80780774c6bb0476f1f830625115e0c" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "half", ] +[[package]] +name = "arrow-schema" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a0d5eb3fe25337ff83e8333a08379bdd1540b0961b1c888f6e505d971c198e1" + [[package]] name = "arrow-schema" version = "58.3.0" @@ -414,6 +552,20 @@ dependencies = [ "serde_json", ] +[[package]] +name = "arrow-select" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2368a78bd32902dba39d52519d70f63799c8b5dc8a9477129a30c2fd3dc70c19" +dependencies = [ + "ahash 0.8.12", + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "num", +] + [[package]] name = "arrow-select" version = "58.3.0" @@ -421,24 +573,41 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8cd065c54172ac787cf3f2f8d4107e0d3fdc26edba76fdf4f4cc170258942222" dependencies = [ "ahash 0.8.12", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "num-traits", ] +[[package]] +name = "arrow-string" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dece58a130b9187756ded8bc071bd8ee9dd7a146566af244b297c7e632fd1ef7" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "memchr", + "num", + "regex", + "regex-syntax", +] + [[package]] name 
= "arrow-string" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "29dd7cda3ab9692f43a2e4acc444d760cc17b12bb6d8232ddf64e9bab7c06b42" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "memchr", "num-traits", "regex", @@ -1391,11 +1560,12 @@ dependencies = [ [[package]] name = "comfy-table" -version = "7.2.2" +version = "7.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" +checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" dependencies = [ - "unicode-segmentation", + "strum 0.26.3", + "strum_macros 0.26.4", "unicode-width 0.2.2", ] @@ -1418,8 +1588,8 @@ name = "compress-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-trait", "bytes", "clap", @@ -1427,7 +1597,7 @@ dependencies = [ "indicatif", "itertools 0.14.0", "lance-bench", - "parquet", + "parquet 58.3.0", "regex", "tokio", "tracing", @@ -1943,8 +2113,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "async-trait", "bytes", "chrono", @@ -1992,8 +2162,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "997a31e15872606a49478e670c58302094c97cb96abb0a7d60720f8e92170040" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "async-trait", "bzip2", "chrono", @@ -2031,7 +2201,7 @@ dependencies = [ "log", "object_store", "parking_lot", - "parquet", + "parquet 58.3.0", "sqlparser 0.62.0", "tempfile", 
"tokio", @@ -2072,7 +2242,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "dashmap", "datafusion-common 53.1.0", @@ -2097,7 +2267,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7dd61161508f8f5fa1107774ea687bd753c22d83a32eebf963549f89de14139" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "dashmap", "datafusion-common 54.0.0", @@ -2122,7 +2292,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog 53.1.0", "datafusion-common 53.1.0", @@ -2145,7 +2315,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897c70f871277f9ce99aa38347be0d679bbe3e617156c4d2a8378cec8a2a0891" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog 54.0.0", "datafusion-common 54.0.0", @@ -2169,8 +2339,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" dependencies = [ "ahash 0.8.12", - "arrow", - "arrow-ipc", + "arrow 58.3.0", + "arrow-ipc 58.3.0", "chrono", "half", "hashbrown 0.16.1", @@ -2191,9 +2361,9 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "121c9ded5d87d9172319e006f2afdb9928d72dbacd6a90a458d8acb1e3b43a65" dependencies = [ - "arrow", - "arrow-ipc", - "arrow-schema", + "arrow 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", "chrono", "foldhash 0.2.0", "half", @@ -2203,7 +2373,7 @@ dependencies = [ "libc", "log", "object_store", - "parquet", + "parquet 58.3.0", "recursive", "sqlparser 0.62.0", "tokio", @@ -2239,7 +2409,7 @@ 
version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "chrono", @@ -2268,7 +2438,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffd7d295b2ec7c00d8a56562f41ed41062cf0af75549ed891c12a0a09eddfefe" dependencies = [ - "arrow", + "arrow 58.3.0", "async-compression", "async-trait", "bytes", @@ -2304,8 +2474,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" dependencies = [ - "arrow", - "arrow-ipc", + "arrow 58.3.0", + "arrow-ipc 58.3.0", "async-trait", "bytes", "datafusion-common 53.1.0", @@ -2328,8 +2498,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "552b0b3f342f7ec41b3fbd70f6339dc82a30cfd0349e7f280e7852528085349f" dependencies = [ - "arrow", - "arrow-ipc", + "arrow 58.3.0", + "arrow-ipc 58.3.0", "async-trait", "bytes", "datafusion-common 54.0.0", @@ -2352,7 +2522,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb517d08967d536284ce70afb5fe8583133779249f2d7b90587d339741a7f195" dependencies = [ - "arrow", + "arrow 58.3.0", "arrow-avro", "async-trait", "bytes", @@ -2371,7 +2541,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 53.1.0", @@ -2394,7 +2564,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68850aa426b897e879c8b87e512ea8124f1d0a2869a4e51808ddaaddf1bc0ada" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 54.0.0", @@ -2417,7 
+2587,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 53.1.0", @@ -2441,7 +2611,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "402f93242ae08ef99139ee2c528a49d087efe88d5c7b2c3ff5480855a40ce54f" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 54.0.0", @@ -2464,7 +2634,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffd2499c1bee0eeccf6a57156105700eeeb17bc701899ac719183c4e74231450" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 54.0.0", @@ -2485,7 +2655,7 @@ dependencies = [ "log", "object_store", "parking_lot", - "parquet", + "parquet 58.3.0", "tokio", ] @@ -2507,8 +2677,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "async-trait", "chrono", "dashmap", @@ -2530,8 +2700,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37a8643ab852eb68864e1b72ae789e8066282dce48eea6347ffb0aee33d1ccc0" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "async-trait", "dashmap", "datafusion-common 54.0.0", @@ -2552,7 +2722,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "chrono", "datafusion-common 53.1.0", @@ -2574,8 +2744,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"6932f4d71eed9c8d9341476a2b845aadfabde5495d08dbcd8fc23881f49fa7a0" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "async-trait", "chrono", "datafusion-common 54.0.0", @@ -2597,7 +2767,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "indexmap 2.14.0", "itertools 0.14.0", @@ -2610,7 +2780,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0225491839a31b1f7d2cb8092c2d50792e2fe1c1724e4e6d08e011f5feaf4ed2" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "indexmap 2.14.0", "itertools 0.14.0", @@ -2622,8 +2792,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "base64", "blake2", "blake3", @@ -2654,8 +2824,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14872c47bfc3d21e53ec82f57074e6987a15941c1e2f43cde4ac6ae2746634e3" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "base64", "blake2", "blake3", @@ -2687,7 +2857,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-doc 53.1.0", "datafusion-execution 53.1.0", @@ -2708,7 +2878,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75a2ca14e1b609be21e657e2d3130b2f446456b08393b377bb721a33952d2e09" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-doc 
54.0.0", "datafusion-execution 54.0.0", @@ -2730,7 +2900,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-expr-common 53.1.0", "datafusion-physical-expr-common 53.1.0", @@ -2742,7 +2912,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ece74ba09092d2ef9c9b54a38445450aea292a1f8b04faf531936b723a24b3c" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-expr-common 54.0.0", "datafusion-physical-expr-common 54.0.0", @@ -2754,8 +2924,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" dependencies = [ - "arrow", - "arrow-ord", + "arrow 58.3.0", + "arrow-ord 58.3.0", "datafusion-common 53.1.0", "datafusion-doc 53.1.0", "datafusion-execution 53.1.0", @@ -2779,8 +2949,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f3e3f9ee8ca59bf70518802107de6f1b88a9509efdc629fadc5de9d6b2d5ef5" dependencies = [ - "arrow", - "arrow-ord", + "arrow 58.3.0", + "arrow-ord 58.3.0", "datafusion-common 54.0.0", "datafusion-doc 54.0.0", "datafusion-execution 54.0.0", @@ -2804,7 +2974,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog 53.1.0", "datafusion-common 53.1.0", @@ -2820,7 +2990,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89161dffc22cf2b50f9f4b1bee83b5221d3b4ed7c2e37fd7aa2b22a5297b3a26" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog 54.0.0", "datafusion-common 
54.0.0", @@ -2836,7 +3006,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-doc 53.1.0", "datafusion-expr 53.1.0", @@ -2854,7 +3024,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7339345b226b3874037708bf5023ba1c2de705128f8457a095aae5ae9cb9c78" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-doc 54.0.0", "datafusion-expr 54.0.0", @@ -2913,7 +3083,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" dependencies = [ - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common 53.1.0", "datafusion-expr 53.1.0", @@ -2932,7 +3102,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77f20e8cf9e8654d92f4c16b24c487353ee5bf153ffc12d5772cd399ab8cd281" dependencies = [ - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common 54.0.0", "datafusion-expr 54.0.0", @@ -2953,7 +3123,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-expr 53.1.0", "datafusion-expr-common 53.1.0", @@ -2975,7 +3145,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f015a4a82f6f7ff7e1d8d4bf3870a936752fa38b17705dfcc14adef95aa8922c" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-expr 54.0.0", "datafusion-expr-common 54.0.0", @@ -2997,7 +3167,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-expr 53.1.0", "datafusion-functions 53.1.0", @@ -3012,7 +3182,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51e6ffff8acdfe54e0ea15ccf38115c4a9184433b0439f42907637928d00a235" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-expr 54.0.0", "datafusion-functions 54.0.0", @@ -3028,7 +3198,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common 53.1.0", "datafusion-expr-common 53.1.0", @@ -3044,7 +3214,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7967a3e171c6a4bf09474b3f7a14f1a3db13ed1714ba12156f33fcce2bba54e8" dependencies = [ - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common 54.0.0", "datafusion-expr-common 54.0.0", @@ -3061,7 +3231,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-execution 53.1.0", "datafusion-expr 53.1.0", @@ -3079,7 +3249,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "59ff803e2a96054cb6d83f35f9e60fd4f42eac515e1932bd1b2dbc91d5fcbf36" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-execution 54.0.0", "datafusion-expr 54.0.0", @@ -3099,9 +3269,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" dependencies = [ "ahash 0.8.12", - "arrow", - "arrow-ord", - "arrow-schema", + "arrow 
58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", "async-trait", "datafusion-common 53.1.0", "datafusion-common-runtime 53.1.0", @@ -3130,11 +3300,11 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "776ee54d47d15bdb126452f9ca17b03761e3b004682914beaedd3f86eb507fbc" dependencies = [ - "arrow", - "arrow-data", - "arrow-ipc", - "arrow-ord", - "arrow-schema", + "arrow 58.3.0", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", "async-trait", "datafusion-common 54.0.0", "datafusion-common-runtime 54.0.0", @@ -3163,7 +3333,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-datasource 53.1.0", "datafusion-expr-common 53.1.0", @@ -3180,7 +3350,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5fb9e5774660aa69c3ba93c610f175f75b65cb8c3776edb3626de8f3a4f4ee3" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-datasource 54.0.0", "datafusion-expr-common 54.0.0", @@ -3224,7 +3394,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "390bb0bf37cb2b95ffd65eacd66f60df50793d1f94097799e416f39477a51957" dependencies = [ - "arrow", + "arrow 58.3.0", "bigdecimal", "chrono", "crc32fast", @@ -3254,7 +3424,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" dependencies = [ - "arrow", + "arrow 58.3.0", "bigdecimal", "chrono", "datafusion-common 53.1.0", @@ -3272,7 +3442,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6094ad36a3ed6d7ac87b20b479b2d0b118250f66cf997603829fdc65b44a7099" dependencies = [ - 
"arrow", + "arrow 58.3.0", "bigdecimal", "chrono", "datafusion-common 54.0.0", @@ -3291,7 +3461,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0c08025966108056d3547d879c4d39e246277494f59ca12920f78187d95eea1" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bigdecimal", "clap", @@ -3833,7 +4003,7 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bcd0ce0249ac12fd44fcde62d435c36d881952c2f0df4d1de24b45e1dbba5ddb" dependencies = [ - "arrow-array", + "arrow-array 58.3.0", "rand 0.9.4", ] @@ -4058,14 +4228,27 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dafe7b7de3fab1a8b7099fd6a6434ca955fa65065f9c19f0f8a133693f3c2b0e" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "geo-traits", "geoarrow-schema", "num-traits", "wkb", - "wkt", + "wkt 0.14.0", +] + +[[package]] +name = "geoarrow-cast" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41c308d653690a4e8ef3cbba69696056bd819e624766ece66d64cc26a638acc1" +dependencies = [ + "arrow-schema 58.3.0", + "geo-traits", + "geoarrow-array", + "geoarrow-schema", + "wkt 0.14.0", ] [[package]] @@ -4074,7 +4257,7 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d4a7edb2a1d87024a93805332a9c8184a0354836271d42c0d18cf628a5e3cd0" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "geo-traits", "serde", "serde_json", @@ -4090,6 +4273,33 @@ dependencies = [ "libm", ] +[[package]] +name = "geojson" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e26f3c45b36fccc9cf2805e61d4da6bc4bbd5a3a9589b01afa3a40eff703bd79" +dependencies = [ + "log", + "serde", + "serde_json", + "thiserror 2.0.18", +] + +[[package]] +name = "geozero" +version = 
"0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5f28f34864745eb2f123c990c6ffd92c1584bd39439b3f27ff2a0f4ea5b309b" +dependencies = [ + "geo-types", + "geojson", + "log", + "scroll", + "serde_json", + "thiserror 1.0.69", + "wkt 0.11.1", +] + [[package]] name = "get_dir" version = "0.5.0" @@ -5014,16 +5224,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3944aca86f4c78f4da04af1c2bf33e664a2826b7af72972ad200d6b9de59019f" dependencies = [ "arc-swap", - "arrow", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ipc", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-ipc 58.3.0", + "arrow-ord 58.3.0", + "arrow-row 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-recursion", "async-trait", "async_cell", @@ -5085,13 +5295,13 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "253f4a0a70580c985b91e65e9ca6cad644825a4078de28d8efbacf3ffbd7ecdc" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ipc", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "bytes", "futures", "getrandom 0.2.17", @@ -5106,13 +5316,13 @@ name = "lance-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-cast", + "arrow-cast 58.3.0", "async-trait", "clap", "futures", "lance", "lance-encoding", - "parquet", + "parquet 58.3.0", "tempfile", "tokio", "tracing", @@ -5136,9 +5346,9 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13f84020da5a484e2f07dd1796e09785ed7cd889857ebc4cb77e32ef214ee594" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + 
"arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "async-trait", "byteorder", "bytes", @@ -5173,13 +5383,13 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7460597a66534a75987993d4dac5bc330586d99c5b79ae73367dbcbd4e29e576" dependencies = [ - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-trait", "chrono", "datafusion 53.1.0", @@ -5205,10 +5415,10 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "046f5506ed2271cd941a050de7bf535dd3aedc291aadec836a63fa56c5926e3b" dependencies = [ - "arrow", - "arrow-array", - "arrow-cast", - "arrow-schema", + "arrow 58.3.0", + "arrow-array 58.3.0", + "arrow-cast 58.3.0", + "arrow-schema 58.3.0", "chrono", "futures", "half", @@ -5225,13 +5435,13 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7af54edf43dcf9d6a56cc636eb35d457e68373c6448dca3f0891b3325b4a24e6" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "bytemuck", "byteorder", "bytes", @@ -5262,12 +5472,12 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0772ae2d6207995dc1eb28aff9507f78e90b3362b58f311da001e9dc25f3d736" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", 
"async-recursion", "async-trait", "byteorder", @@ -5296,12 +5506,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e71fbfb51096a903cb524fe0da716f5f15fbc4a6b6f84cd6dec21abf319c5e84" dependencies = [ "arc-swap", - "arrow", - "arrow-arith", - "arrow-array", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-channel", "async-recursion", "async-trait", @@ -5360,14 +5570,14 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bab8c98ef1b870b20541d27f3ca4efdf7c9f5c25214233be07d231ba88900219" dependencies = [ - "arrow", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-recursion", "async-trait", "byteorder", @@ -5400,9 +5610,9 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b4c51cad0ac780b02dc4da48528479e7693c03e8d05390510bbc69ca2a9a1f1" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "cc", "deepsize", "half", @@ -5418,7 +5628,7 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "014e8332ca0615506342e0d3af608639864b68396973be14239f09c9f21f1fc2" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "lance-core", @@ -5446,11 +5656,11 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b16f1355904aea4ebb04ffc70c58c97901e10bde44452b4b021de4a1f329250d" dependencies = [ - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-ipc", - "arrow-schema", + 
"arrow 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", "async-trait", "byteorder", "bytes", @@ -5793,6 +6003,15 @@ dependencies = [ "libc", ] +[[package]] +name = "lz4_flex" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a" +dependencies = [ + "twox-hash", +] + [[package]] name = "lz4_flex" version = "0.13.1" @@ -6135,6 +6354,20 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + [[package]] name = "num-bigint" version = "0.4.6" @@ -6169,6 +6402,28 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -6523,6 +6778,41 @@ dependencies = [ "windows-link", ] +[[package]] +name = "parquet" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3abbfef8a25900f4925c86e4cb881ea24672ca3c31ee4fb50a8083c4c56d313" +dependencies = [ + "ahash 0.8.12", + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-cast 56.2.1", + "arrow-data 56.2.1", + "arrow-ipc 56.2.1", + "arrow-schema 56.2.1", + 
"arrow-select 56.2.1", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown 0.16.1", + "lz4_flex 0.11.6", + "num", + "num-bigint", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", +] + [[package]] name = "parquet" version = "58.3.0" @@ -6530,12 +6820,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" dependencies = [ "ahash 0.8.12", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ipc", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "base64", "brotli", "bytes", @@ -6544,7 +6834,7 @@ dependencies = [ "futures", "half", "hashbrown 0.17.1", - "lz4_flex", + "lz4_flex 0.13.1", "num-bigint", "num-integer", "num-traits", @@ -6565,8 +6855,8 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74c8db065291f088a2aad8ab831853eae1871c0d311c8d0b83bbc3b7e735d0fc" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "chrono", "half", "indexmap 2.14.0", @@ -6581,8 +6871,8 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a530e8d5b5e14efcb39c9a6ec55432ad11f6afb7dc4455a79be0dc615fe3cc31" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "chrono", "half", "indexmap 2.14.0", @@ -6598,7 +6888,7 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00ed89908289f67caa2ca078f9ff9aacd6229a313ec92b12bf4f48f613dc2b97" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "base64", "chrono", "parquet-variant", @@ -7989,6 +8279,12 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d68f2ec51b097e4c1a75b681a8bec621909b5e91f15bb7b840c4f2f7b01148b2" +[[package]] +name = "scroll" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04c565b551bafbef4157586fa379538366e4385d42082f255bfd96e4fe8519da" + [[package]] name = "seahash" version = "4.1.0" @@ -8410,6 +8706,30 @@ dependencies = [ "smallvec", ] +[[package]] +name = "spatialbench" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07f3f4b67ccf571f183d3695aa6b9d6f996864c31782a480e97a23ed0f2f6f18" +dependencies = [ + "geo", + "once_cell", + "rand 0.8.6", + "serde", +] + +[[package]] +name = "spatialbench-arrow" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad89c32ed9e258bcc89713c296c7437963ce31f511eb8a408d2046e853294206" +dependencies = [ + "arrow 56.2.1", + "geo", + "geozero", + "spatialbench", +] + [[package]] name = "sqllogictest" version = "0.29.1" @@ -9145,7 +9465,7 @@ name = "tpchgen-arrow" version = "2.0.2" source = "git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f#438e9c2dbc25b2fff82c0efc08b3f13b5707874f" dependencies = [ - "arrow", + "arrow 58.3.0", "tpchgen", ] @@ -9426,12 +9746,12 @@ name = "vortex" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", + "arrow-array 58.3.0", "codspeed-divan-compat", "fastlanes", "futures", "mimalloc", - "parquet", + "parquet 58.3.0", "paste", "rand 0.10.1", "rand_distr 0.6.0", @@ -9497,15 +9817,15 @@ dependencies = [ "arbitrary", "arc-swap", "arcref", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-ord", - "arrow-schema", - "arrow-select", - "arrow-string", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", + "arrow-string 58.3.0", "async-lock", "bytes", "cfg-if", @@ 
-9570,13 +9890,15 @@ name = "vortex-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-trait", "bzip2", "clap", "futures", + "geoarrow", + "geoarrow-cast", "get_dir", "glob", "humansize", @@ -9587,12 +9909,15 @@ dependencies = [ "noodles-bgzf", "noodles-vcf", "parking_lot", - "parquet", + "parquet 56.2.1", + "parquet 58.3.0", "rand 0.10.1", "regex", "reqwest 0.13.4", "serde", "serde_json", + "spatialbench", + "spatialbench-arrow", "sysinfo", "tabled", "target-lexicon", @@ -9607,6 +9932,7 @@ dependencies = [ "url", "uuid", "vortex", + "vortex-geo", "vortex-tensor", ] @@ -9645,7 +9971,7 @@ dependencies = [ name = "vortex-buffer" version = "0.1.0" dependencies = [ - "arrow-buffer", + "arrow-buffer 58.3.0", "bitvec", "bytes", "codspeed-divan-compat", @@ -9677,13 +10003,13 @@ dependencies = [ name = "vortex-compat" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-select", + "arrow-array 58.3.0", + "arrow-select 58.3.0", "base16ct", "bytes", "clap", "futures", - "parquet", + "parquet 58.3.0", "reqwest 0.13.4", "serde", "serde_json", @@ -9725,11 +10051,11 @@ dependencies = [ name = "vortex-compute" version = "0.1.0" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-schema", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-schema 58.3.0", "codspeed-divan-compat", "num-traits", "rand 0.10.1", @@ -9752,7 +10078,7 @@ name = "vortex-cuda" version = "0.1.0" dependencies = [ "arc-swap", - "arrow-schema", + "arrow-schema 58.3.0", "async-trait", "bindgen", "bytes", @@ -9783,7 +10109,7 @@ dependencies = [ name = "vortex-cuda-ffi" version = "0.1.0" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "futures", "vortex", "vortex-array", @@ -9805,8 +10131,8 @@ name = "vortex-cxx" version = "0.1.0" dependencies = [ "anyhow", - 
"arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-fs", "cxx", "futures", @@ -9820,8 +10146,8 @@ name = "vortex-datafusion" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-trait", "datafusion 54.0.0", "datafusion-catalog 54.0.0", @@ -9920,7 +10246,7 @@ dependencies = [ name = "vortex-error" version = "0.1.0" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "flatbuffers", "jiff", "object_store", @@ -9955,8 +10281,8 @@ dependencies = [ name = "vortex-ffi" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-fs", "bytes", "cbindgen", @@ -10077,16 +10403,18 @@ dependencies = [ name = "vortex-geo" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "geo", "geo-traits", "geo-types", "geoarrow", + "geoarrow-cast", "prost 0.14.4", "rstest", "vortex-array", "vortex-error", + "vortex-layout", "vortex-session", "wkb", ] @@ -10144,8 +10472,8 @@ dependencies = [ name = "vortex-jni" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-fs", "futures", "jni", @@ -10164,8 +10492,8 @@ dependencies = [ name = "vortex-json" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "prost 0.14.4", "vortex-array", "vortex-error", @@ -10178,8 +10506,8 @@ name = "vortex-layout" version = "0.1.0" dependencies = [ "arcref", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-stream", "async-trait", "bit-vec", @@ -10269,9 +10597,9 @@ dependencies = [ name = "vortex-parquet-variant" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "chrono", 
"parquet-variant", "parquet-variant-compute", @@ -10316,9 +10644,9 @@ dependencies = [ name = "vortex-python" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "async-fs", "bytes", "itertools 0.14.0", @@ -10340,9 +10668,9 @@ dependencies = [ name = "vortex-row" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-row", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-row 58.3.0", + "arrow-schema 58.3.0", "bytes", "codspeed-divan-compat", "mimalloc", @@ -10361,8 +10689,8 @@ name = "vortex-runend" version = "0.1.0" dependencies = [ "arbitrary", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "codspeed-divan-compat", "itertools 0.14.0", "num-traits", @@ -10460,8 +10788,8 @@ dependencies = [ name = "vortex-tensor" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "codspeed-divan-compat", "half", "itertools 0.14.0", @@ -10484,8 +10812,8 @@ dependencies = [ name = "vortex-test-e2e-cuda" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "futures", "vortex", "vortex-cuda", @@ -10496,8 +10824,8 @@ name = "vortex-tui" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "clap", "console_error_panic_hook", "crossterm", @@ -10510,7 +10838,7 @@ dependencies = [ "indicatif", "itertools 0.14.0", "js-sys", - "parquet", + "parquet 58.3.0", "ratatui", "ratzilla", "serde", @@ -10537,9 +10865,9 @@ dependencies = [ name = "vortex-web-wasm" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-ipc", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", "console_error_panic_hook", "futures", "js-sys", @@ -11083,6 +11411,18 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = 
"wkt" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54f7f1ff4ea4c18936d6cd26a6fd24f0003af37e951a8e0e8b9e9a2d0bd0a46d" +dependencies = [ + "geo-types", + "log", + "num-traits", + "thiserror 1.0.69", +] + [[package]] name = "wkt" version = "0.14.0" diff --git a/Cargo.toml b/Cargo.toml index 0a1cd5731e9..828171f3050 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -162,6 +162,7 @@ geo = "0.31.0" geo-traits = "0.3.0" geo-types = "0.7.19" geoarrow = "0.8.0" +geoarrow-cast = "0.8.0" get_dir = "0.5.0" glob = "0.3.2" goldenfile = "1" @@ -241,6 +242,14 @@ similar = "3.0.0" sketches-ddsketch = "0.4.0" smallvec = "1.15.1" smol = "2.0.2" +spatialbench = "0.2" +spatialbench-arrow = "0.2" +# spatialbench still pins arrow 56, two majors behind the workspace arrow. Until upstream +# catches up, write its generated batches with a matching parquet instead of converting +# arrow versions at the boundary. +spatialbench-parquet = { package = "parquet", version = "56", features = [ + "async", +] } static_assertions = "1.1" strum = "0.28" syn = { version = "2.0.117", features = ["full"] } diff --git a/benchmarks/duckdb-bench/src/lib.rs b/benchmarks/duckdb-bench/src/lib.rs index fed9f82b004..74f7a0cfa64 100644 --- a/benchmarks/duckdb-bench/src/lib.rs +++ b/benchmarks/duckdb-bench/src/lib.rs @@ -27,6 +27,9 @@ pub struct DuckClient { connection: Option, pub db_path: PathBuf, pub threads: Option, + /// Replayed on every (re)open, since extensions load per instance. Currently + /// `INSTALL spatial; LOAD spatial;` for SpatialBench. + init_sql: Vec, } impl DuckClient { @@ -68,9 +71,19 @@ impl DuckClient { connection: Some(connection), db_path, threads, + init_sql: Vec::new(), }) } + /// Run `statements` now and after every subsequent [`DuckClient::reopen`]. 
+ pub fn set_init_sql(&mut self, statements: Vec) -> Result<()> { + for stmt in &statements { + self.connection().query(stmt)?; + } + self.init_sql = statements; + Ok(()) + } + pub fn open_and_setup_database( path: Option, threads: Option, @@ -118,6 +131,14 @@ impl DuckClient { self.db = Some(db); self.connection = Some(connection); + // Replay init SQL (e.g. LOAD spatial) — extensions are per-instance. + for stmt in &self.init_sql { + self.connection + .as_ref() + .vortex_expect("connection just opened") + .query(stmt)?; + } + Ok(()) } @@ -133,6 +154,7 @@ impl DuckClient { connection: Some(connection), db_path, threads: None, + init_sql: Vec::new(), }) } diff --git a/benchmarks/duckdb-bench/src/main.rs b/benchmarks/duckdb-bench/src/main.rs index 8ba4937f566..48accc99067 100644 --- a/benchmarks/duckdb-bench/src/main.rs +++ b/benchmarks/duckdb-bench/src/main.rs @@ -171,12 +171,13 @@ fn main() -> anyhow::Result<()> { &filtered_queries, mode, |format| { - let ctx = DuckClient::new( + let mut ctx = DuckClient::new( &*benchmark, format, args.delete_duckdb_database, args.threads, )?; + ctx.set_init_sql(benchmark.engine_init_sql(Engine::DuckDB))?; ctx.register_tables(&*benchmark, format)?; // Duckdb doesn't support octet_length for strings but we need this @@ -196,7 +197,10 @@ fn main() -> anyhow::Result<()> { if !args.reuse { ctx.reopen()?; } - ctx.execute_query_result(query) + // Adapt the query to how this format surfaces its columns (e.g. SpatialBench geometry: + // `GEOMETRY` for Vortex vs. WKB `BLOB` for Parquet). 
+ let query = benchmark.query_for_format(query, format); + ctx.execute_query_result(&query) }, )?; diff --git a/vortex-array/src/aggregate_fn/plugin.rs b/vortex-array/src/aggregate_fn/plugin.rs index b7ff8b893ac..e14053f6a24 100644 --- a/vortex-array/src/aggregate_fn/plugin.rs +++ b/vortex-array/src/aggregate_fn/plugin.rs @@ -10,6 +10,7 @@ use crate::aggregate_fn::AggregateFn; use crate::aggregate_fn::AggregateFnId; use crate::aggregate_fn::AggregateFnRef; use crate::aggregate_fn::AggregateFnVTable; +use crate::dtype::DType; /// Reference-counted pointer to an aggregate function plugin. pub type AggregateFnPluginRef = Arc; @@ -28,6 +29,9 @@ pub trait AggregateFnPlugin: 'static + Send + Sync { /// Deserialize an aggregate function from serialized metadata. fn deserialize(&self, metadata: &[u8], session: &VortexSession) -> VortexResult; + + /// The default per-chunk zone statistic to store for a column of `input_dtype`, or `None` if this aggregate isn't one. + fn zone_stat_default(&self, input_dtype: &DType) -> Option; } impl std::fmt::Debug for dyn AggregateFnPlugin { @@ -51,4 +55,8 @@ impl AggregateFnPlugin for V { let options = AggregateFnVTable::deserialize(self, metadata, session)?; Ok(AggregateFn::new(self.clone(), options).erased()) } + + fn zone_stat_default(&self, input_dtype: &DType) -> Option { + AggregateFnVTable::zone_stat_default(self, input_dtype) + } } diff --git a/vortex-array/src/aggregate_fn/session.rs b/vortex-array/src/aggregate_fn/session.rs index b309fd02a10..0e4b9e563bb 100644 --- a/vortex-array/src/aggregate_fn/session.rs +++ b/vortex-array/src/aggregate_fn/session.rs @@ -9,6 +9,7 @@ use vortex_session::SessionVar; use crate::aggregate_fn::AggregateFnId; use crate::aggregate_fn::AggregateFnPluginRef; +use crate::aggregate_fn::AggregateFnRef; use crate::aggregate_fn::AggregateFnVTable; use crate::aggregate_fn::fns::all_nan::AllNan; use crate::aggregate_fn::fns::all_non_distinct::AllNonDistinct; @@ -43,6 +44,7 @@ use 
crate::arrays::chunked::compute::aggregate::ChunkedArrayAggregate; use crate::arrays::dict::compute::is_constant::DictIsConstantKernel; use crate::arrays::dict::compute::is_sorted::DictIsSortedKernel; use crate::arrays::dict::compute::min_max::DictMinMaxKernel; +use crate::dtype::DType; /// Session state for aggregate functions and encoding-specific aggregate kernels. /// @@ -133,6 +135,17 @@ impl AggregateFnSession { self.registry.insert(id, pluginref); } + /// The default per-chunk zone statistics for a column of `input_dtype`, collected from every + /// registered aggregate's `zone_stat_default`. + pub fn zone_stat_defaults(&self, input_dtype: &DType) -> Vec { + self.registry.read(|registry| { + registry + .values() + .filter_map(|plugin| plugin.zone_stat_default(input_dtype)) + .collect() + }) + } + /// Returns the aggregate kernel registered for `array_id` and `agg_fn_id`, if any. /// /// Lookup first checks for a kernel registered for the exact aggregate function, then falls diff --git a/vortex-array/src/aggregate_fn/vtable.rs b/vortex-array/src/aggregate_fn/vtable.rs index 49b28dd26d7..372a1287fce 100644 --- a/vortex-array/src/aggregate_fn/vtable.rs +++ b/vortex-array/src/aggregate_fn/vtable.rs @@ -93,6 +93,13 @@ pub trait AggregateFnVTable: 'static + Sized + Clone + Send + Sync { /// Returns `None` if the aggregate function cannot be applied to the input dtype. fn return_dtype(&self, options: &Self::Options, input_dtype: &DType) -> Option; + /// If this aggregate should be computed as a default zone statistic for `input_dtype`, return + /// the bound aggregate to store. Default: not a zone-map default. + fn zone_stat_default(&self, input_dtype: &DType) -> Option { + let _ = input_dtype; + None + } + /// DType of the intermediate partial accumulator state. 
/// /// Use a struct dtype when multiple fields are needed diff --git a/vortex-bench/Cargo.toml b/vortex-bench/Cargo.toml index 3b793c6124a..0187bdb986e 100644 --- a/vortex-bench/Cargo.toml +++ b/vortex-bench/Cargo.toml @@ -23,6 +23,7 @@ vortex = { workspace = true, features = [ "tokio", "zstd", ] } +vortex-geo = { workspace = true } vortex-tensor = { workspace = true } # TODO(connor): In the future, this might be inside vortex. anyhow = { workspace = true } @@ -33,6 +34,8 @@ async-trait = { workspace = true } bzip2 = { workspace = true } clap = { workspace = true, features = ["derive"] } futures = { workspace = true } +geoarrow = { workspace = true } +geoarrow-cast = { workspace = true } get_dir = { workspace = true } glob = { workspace = true } humansize = { workspace = true } @@ -48,6 +51,9 @@ regex = { workspace = true } reqwest = { workspace = true, features = ["stream"] } serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } +spatialbench = { workspace = true } +spatialbench-arrow = { workspace = true } +spatialbench-parquet = { workspace = true } sysinfo = { workspace = true } tabled = { workspace = true, features = ["std"] } target-lexicon = { workspace = true } diff --git a/vortex-bench/spatialbench.sql b/vortex-bench/spatialbench.sql new file mode 100644 index 00000000000..a3ed722416b --- /dev/null +++ b/vortex-bench/spatialbench.sql @@ -0,0 +1,229 @@ +-- SpatialBench queries (Apache Sedona), WKB dialect. See sedona-spatialbench/docs/queries.md. +-- Numbered from Q0 (= SpatialBench Q1). Only Q0, Q1, Q2, Q5 and Q7 are wired up today (see `SUPPORTED_QUERIES` in src/spatialbench/benchmark.rs); the rest are not run yet. + +-- Q0: Find trips starting within 50km of the Sedona city center, ranked by distance. 
+SELECT + t_tripkey, + ST_X(ST_GeomFromWKB(t_pickuploc)) AS pickup_lon, + ST_Y(ST_GeomFromWKB(t_pickuploc)) AS pickup_lat, + t_pickuptime, + ST_Distance(ST_GeomFromWKB(t_pickuploc), ST_Point(-111.7610::double, 34.8697::double)) AS distance_to_center +FROM trip +WHERE ST_Distance(ST_GeomFromWKB(t_pickuploc), ST_Point(-111.7610::double, 34.8697::double)) <= 0.45::double +ORDER BY distance_to_center ASC, t_tripkey ASC; + +-- Q1: Count trips starting within Coconino County (Arizona) zone. +SELECT COUNT(*) AS trip_count_in_coconino_county +FROM trip t +WHERE ST_Intersects( + ST_GeomFromWKB(t.t_pickuploc), + ( + SELECT ST_GeomFromWKB(z.z_boundary) + FROM zone z + WHERE z.z_name = 'Coconino County' + LIMIT 1 + ) +); + +-- Q2: Monthly trip statistics within a 15km radius of the Sedona city center. +SELECT + DATE_TRUNC('month', t.t_pickuptime) AS pickup_month, + COUNT(t.t_tripkey) AS total_trips, + AVG(t.t_distance) AS avg_distance, + AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration, + AVG(t.t_fare) AS avg_fare +FROM trip t +-- ST_DWithin(a, b, d) equals ST_Distance(a, b) <= d, written as the distance comparison so the +-- radius filter pushes into the scan (DuckDB stashes ST_DWithin's distance in bind_data, hiding it). +WHERE ST_Distance( + ST_GeomFromWKB(t.t_pickuploc), + ST_GeomFromText('POLYGON(( + -111.9060 34.7347, -111.6160 34.7347, + -111.6160 35.0047, -111.9060 35.0047, + -111.9060 34.7347 + ))') -- Bounding box around Sedona +) <= 0.045 -- 5km buffer in degrees +GROUP BY DATE_TRUNC('month', t.t_pickuptime) +ORDER BY pickup_month; + +-- Q3: Zone distribution of top 1000 trips by tip amount. 
+SELECT + z.z_zonekey, + z.z_name, + COUNT(*) AS trip_count +FROM + zone z + JOIN ( + SELECT t.t_pickuploc + FROM trip t + ORDER BY t.t_tip DESC, t.t_tripkey ASC + LIMIT 1000 + ) top_trips + ON ST_Within( + ST_GeomFromWKB(top_trips.t_pickuploc), + ST_GeomFromWKB(z.z_boundary) + ) +GROUP BY z.z_zonekey, z.z_name +ORDER BY trip_count DESC, z.z_zonekey ASC; + +-- Q4: Monthly travel patterns for repeat customers (convex hull of dropoff locations). +SELECT + c.c_custkey, + c.c_name AS customer_name, + DATE_TRUNC('month', t.t_pickuptime) AS pickup_month, + ST_Area( + ST_ConvexHull(ST_Collect(ST_GeomFromWKB(t.t_dropoffloc))) + ) AS monthly_travel_hull_area, + COUNT(*) as dropoff_count +FROM trip t +JOIN customer c + ON t.t_custkey = c.c_custkey +GROUP BY c.c_custkey, c.c_name, pickup_month +HAVING dropoff_count > 5 -- Only include repeat customers +ORDER BY monthly_travel_hull_area DESC, c.c_custkey ASC; + +-- Q5: Zone statistics for trips within a 50km radius of the Sedona city center. +SELECT + z.z_zonekey, + z.z_name, + COUNT(t.t_tripkey) AS total_pickups, + AVG(t.t_distance) AS avg_distance, + AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration +FROM trip t, zone z +WHERE ST_Intersects( + ST_GeomFromText('POLYGON(( + -112.2110 34.4197, -111.3110 34.4197, + -111.3110 35.3197, -112.2110 35.3197, + -112.2110 34.4197 + ))'), -- Bounding box around Sedona + ST_GeomFromWKB(z.z_boundary) + ) + AND ST_Within( + ST_GeomFromWKB(t.t_pickuploc), + ST_GeomFromWKB(z.z_boundary) + ) +GROUP BY z.z_zonekey, z.z_name +ORDER BY total_pickups DESC, z.z_zonekey ASC; + +-- Q6: Detect potential route detours by comparing reported vs. geometric distances. +WITH trip_lengths AS ( + SELECT + t.t_tripkey, + t.t_distance AS reported_distance_m, + ST_Length( + ST_MakeLine( + ST_GeomFromWKB(t.t_pickuploc), + ST_GeomFromWKB(t.t_dropoffloc) + ) + ) * 111111 AS line_distance_m -- Approx. 
meters per degree + FROM trip t +) +SELECT + t.t_tripkey, + t.reported_distance_m, + t.line_distance_m, + t.reported_distance_m / NULLIF(t.line_distance_m, 0) AS detour_ratio +FROM trip_lengths t +ORDER BY + detour_ratio DESC NULLS LAST, + reported_distance_m DESC, + t_tripkey ASC; + +-- Q7: Count nearby pickups for each building within a 500m radius. +SELECT b.b_buildingkey, b.b_name, COUNT(*) AS nearby_pickup_count +FROM trip t +JOIN building b +ON ST_DWithin(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(b.b_boundary), 0.0045) -- ~500m +GROUP BY b.b_buildingkey, b.b_name +ORDER BY nearby_pickup_count DESC, b.b_buildingkey ASC; + +-- Q8: Building conflation (duplicate/overlap detection via IoU). +WITH b1 AS ( + SELECT b_buildingkey AS id, ST_GeomFromWKB(b_boundary) AS geom + FROM building +), +b2 AS ( + SELECT b_buildingkey AS id, ST_GeomFromWKB(b_boundary) AS geom + FROM building +), +pairs AS ( + SELECT + b1.id AS building_1, + b2.id AS building_2, + ST_Area(b1.geom) AS area1, + ST_Area(b2.geom) AS area2, + ST_Area(ST_Intersection(b1.geom, b2.geom)) AS overlap_area + FROM b1 + JOIN b2 ON b1.id < b2.id AND ST_Intersects(b1.geom, b2.geom) +) +SELECT + building_1, + building_2, + area1, + area2, + overlap_area, + CASE + WHEN (area1 + area2 - overlap_area) = 0 THEN 1.0 + ELSE overlap_area / (area1 + area2 - overlap_area) + END AS iou +FROM pairs +ORDER BY iou DESC, building_1 ASC, building_2 ASC; + +-- Q9: Zone statistics for trips starting within each zone. +SELECT + z.z_zonekey, + z.z_name AS pickup_zone, + AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration, + AVG(t.t_distance) AS avg_distance, + COUNT(t.t_tripkey) AS num_trips +FROM + zone z + LEFT JOIN trip t + ON ST_Within( + ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(z.z_boundary) + ) +GROUP BY z.z_zonekey, z.z_name +ORDER BY avg_duration DESC NULLS LAST, z.z_zonekey ASC; + +-- Q10: Count trips that cross between different zones. 
+SELECT COUNT(*) AS cross_zone_trip_count +FROM + trip t + JOIN zone pickup_zone + ON ST_Within( + ST_GeomFromWKB(t.t_pickuploc), + ST_GeomFromWKB(pickup_zone.z_boundary) + ) + JOIN zone dropoff_zone + ON ST_Within( + ST_GeomFromWKB(t.t_dropoffloc), + ST_GeomFromWKB(dropoff_zone.z_boundary) + ) +WHERE pickup_zone.z_zonekey != dropoff_zone.z_zonekey; + +-- Q11: Find five nearest buildings to each trip pickup location using KNN join. +WITH trip_with_geom AS ( + SELECT + t_tripkey, + t_pickuploc, + ST_GeomFromWKB(t_pickuploc) as pickup_geom + FROM trip +), +building_with_geom AS ( + SELECT + b_buildingkey, + b_name, + b_boundary, + ST_GeomFromWKB(b_boundary) as boundary_geom + FROM building +) +SELECT + t.t_tripkey, + t.t_pickuploc, + b.b_buildingkey, + b.b_name AS building_name, + ST_Distance(t.pickup_geom, b.boundary_geom) AS distance_to_building +FROM trip_with_geom t +JOIN building_with_geom b + ON ST_KNN(t.pickup_geom, b.boundary_geom, 5, FALSE) +ORDER BY t.t_tripkey ASC, distance_to_building ASC, b.b_buildingkey ASC; diff --git a/vortex-bench/src/benchmark.rs b/vortex-bench/src/benchmark.rs index 2872a02aa64..ddf728f8125 100644 --- a/vortex-bench/src/benchmark.rs +++ b/vortex-bench/src/benchmark.rs @@ -8,6 +8,7 @@ use glob::Pattern; use url::Url; use crate::BenchmarkDataset; +use crate::Engine; use crate::Format; /// Specification for a table in a benchmark dataset. @@ -32,6 +33,24 @@ pub trait Benchmark: Send + Sync { /// Get all available queries for this benchmark fn queries(&self) -> anyhow::Result>; + /// Adapt a query to a specific storage `format` before execution. Default: unchanged. + /// + /// Used when the same logical query must be phrased differently depending on how a format + /// surfaces its columns to the engine — e.g. SpatialBench geometry reads back as native + /// `GEOMETRY` from Vortex but as a WKB `BLOB` from Parquet, so the `ST_GeomFromWKB(..)` wrappers + /// are stripped for one and kept for the other. 
+ fn query_for_format(&self, query: &str, format: Format) -> String { + let _ = format; + query.to_string() + } + + /// SQL an `engine` must run before this benchmark's queries (e.g. loading engine + /// extensions). Runners replay these after every (re)open. Default: none. + fn engine_init_sql(&self, engine: Engine) -> Vec { + let _ = engine; + Vec::new() + } + /// Generate or prepare base data for the benchmark (typically Parquet format). /// This is the canonical source data that can be converted to other formats. /// This should be idempotent - safe to call multiple times. diff --git a/vortex-bench/src/datasets/mod.rs b/vortex-bench/src/datasets/mod.rs index 3e72ba69e7f..353efbb4aea 100644 --- a/vortex-bench/src/datasets/mod.rs +++ b/vortex-bench/src/datasets/mod.rs @@ -69,6 +69,11 @@ pub enum BenchmarkDataset { ClickBench { flavor: Flavor }, #[serde(rename = "public-bi")] PublicBi { name: String }, + #[serde(rename = "spatialbench")] + SpatialBench { + scale_factor: String, + native_points: bool, + }, #[serde(rename = "statpopgen")] StatPopGen { n_rows: u64 }, #[serde(rename = "polarsignals")] @@ -87,6 +92,7 @@ impl BenchmarkDataset { BenchmarkDataset::TpcDS { .. } => "tpcds", BenchmarkDataset::ClickBench { .. } => "clickbench", BenchmarkDataset::PublicBi { .. } => "public-bi", + BenchmarkDataset::SpatialBench { .. } => "spatialbench", BenchmarkDataset::StatPopGen { .. } => "statpopgen", BenchmarkDataset::PolarSignals { .. 
} => "polarsignals", BenchmarkDataset::Fineweb => "fineweb", @@ -106,6 +112,17 @@ impl Display for BenchmarkDataset { Flavor::Single => write!(f, "clickbench-single"), }, BenchmarkDataset::PublicBi { name } => write!(f, "public-bi({name})"), + BenchmarkDataset::SpatialBench { + scale_factor, + native_points, + } => { + let points = if *native_points { + ", points=native" + } else { + "" + }; + write!(f, "spatialbench(sf={scale_factor}{points})") + } BenchmarkDataset::StatPopGen { n_rows } => write!(f, "statpopgen(n_rows={n_rows})"), BenchmarkDataset::PolarSignals { n_rows } => { write!(f, "polarsignals(n_rows={n_rows})") @@ -163,6 +180,7 @@ impl BenchmarkDataset { "supplier", ], BenchmarkDataset::ClickBench { .. } | BenchmarkDataset::PublicBi { .. } => todo!(), + BenchmarkDataset::SpatialBench { .. } => &["trip", "building", "zone"], BenchmarkDataset::StatPopGen { .. } => &["statpopgen"], BenchmarkDataset::PolarSignals { .. } => &["stacktraces"], BenchmarkDataset::Fineweb => &["fineweb"], diff --git a/vortex-bench/src/lib.rs b/vortex-bench/src/lib.rs index 30ff45c97a8..b131906d85f 100644 --- a/vortex-bench/src/lib.rs +++ b/vortex-bench/src/lib.rs @@ -34,6 +34,8 @@ use vortex::file::VortexWriteOptions; use vortex::file::WriteStrategyBuilder; use vortex::utils::aliases::hash_map::HashMap; +use crate::spatialbench::SpatialBenchBenchmark; + pub mod appian; pub mod benchmark; pub mod clickbench; @@ -51,6 +53,7 @@ pub mod public_bi; pub mod random_access; pub mod realnest; pub mod runner; +pub mod spatialbench; pub mod statpopgen; pub mod tpcds; pub mod tpch; @@ -72,8 +75,11 @@ use vortex::session::VortexSession; #[global_allocator] static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; -pub static SESSION: LazyLock = - LazyLock::new(|| VortexSession::default().with_tokio()); +pub static SESSION: LazyLock = LazyLock::new(|| { + let session = VortexSession::default().with_tokio(); + vortex_geo::initialize(&session); + session +}); #[derive(Clone, Copy, Debug, Hash, 
PartialEq, Eq, Serialize, Deserialize)] pub struct Target { @@ -265,6 +271,8 @@ pub enum BenchmarkArg { PolarSignals, #[clap(name = "public-bi")] PublicBi, + #[clap(name = "spatialbench")] + SpatialBench, } /// Default scale factor for TPC-related benchmarks @@ -326,6 +334,21 @@ pub fn create_benchmark(b: BenchmarkArg, opts: &Opts) -> anyhow::Result { + let scale_factor = opts.get(SCALE_FACTOR_KEY).unwrap_or(DEFAULT_SCALE_FACTOR); + let remote_data_dir = opts.get_as::(REMOTE_DATA_KEY); + let native_points = match opts.get("points") { + None | Some("wkb") => false, + Some("native") => true, + Some(other) => bail!("unknown points option {other:?}, expected wkb or native"), + }; + let benchmark = SpatialBenchBenchmark::new( + scale_factor.to_string(), + remote_data_dir, + native_points, + )?; + Ok(Box::new(benchmark) as _) + } } } diff --git a/vortex-bench/src/spatialbench/benchmark.rs b/vortex-bench/src/spatialbench/benchmark.rs new file mode 100644 index 00000000000..0429b211a62 --- /dev/null +++ b/vortex-bench/src/spatialbench/benchmark.rs @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! SpatialBench benchmark implementation + +use std::fs; + +use url::Url; + +use crate::Benchmark; +use crate::BenchmarkDataset; +use crate::Engine; +use crate::Format; +use crate::TableSpec; +use crate::spatialbench::datagen; +use crate::spatialbench::datagen::Table; +use crate::utils::file::resolve_data_url; +use crate::workspace_root; + +/// Data-dir subfolder for the native-geometry Vortex files (`points=native`). +pub const NATIVE_DIR: &str = "vortex-native"; + +/// Data-dir subfolder for the native-geometry GeoParquet files (`points=native`). 
+pub const PARQUET_NATIVE_DIR: &str = "parquet-native"; + +/// Queries wired up to run (0-based, `spatialbench.sql` order): Q0 (radius filter), Q1 (zone +/// point-in-polygon), Q2 (point-to-polygon radius, rewritten from `ST_DWithin`), Q5 (zone stats in a +/// Sedona bounding box — its `ST_Intersects(const, z_boundary)` filter pushes into the `zone` scan; +/// the `ST_Within` half stays a DuckDB spatial join), and Q7 (building join). Q1/Q5 need the +/// externally-sourced `zone` table. The file holds the full suite; the rest need tables/functions not +/// wired yet. +const SUPPORTED_QUERIES: &[usize] = &[0, 1, 2, 5, 7]; + +/// SpatialBench geospatial benchmark (Apache Sedona): a `trip` point table and `building` polygons, +/// queried with spatial filters and joins. See . +pub struct SpatialBenchBenchmark { + pub scale_factor: String, + pub data_url: Url, + /// `--opt points=native`: store geometry as native `Point`/`Polygon` (not WKB) and read the + /// native data dirs. The query dialect is chosen per format in [`Self::query_for_format`], not + /// by this flag. + pub native_points: bool, +} + +impl SpatialBenchBenchmark { + pub fn new( + scale_factor: String, + use_remote_data_dir: Option, + native_points: bool, + ) -> anyhow::Result { + Ok(Self { + data_url: resolve_data_url( + use_remote_data_dir.as_deref(), + &format!("spatialbench/{scale_factor}"), + )?, + scale_factor, + native_points, + }) + } +} + +#[async_trait::async_trait] +impl Benchmark for SpatialBenchBenchmark { + fn queries(&self) -> anyhow::Result> { + // The file is the WKB dialect (`ST_GeomFromWKB(..)`). The dialect is adapted per format in + // `query_for_format` — not by `points=` — since whether geometry reads back as `GEOMETRY` or + // `BLOB` depends on the format, not the storage encoding. Statements are `;`-separated, + // numbered 0-based in file order; only `SUPPORTED_QUERIES` run. 
+ let queries_file = workspace_root() + .join("vortex-bench") + .join("spatialbench") + .with_extension("sql"); + let contents = fs::read_to_string(queries_file)?; + Ok(contents + .split_terminator(';') + .map(str::trim) + .map(str::to_string) + .enumerate() + .filter(|(idx, _)| SUPPORTED_QUERIES.contains(idx)) + .collect()) + } + + /// Only `points=native` Vortex surfaces geometry as `GEOMETRY`, so it drops the + /// `ST_GeomFromWKB(..)` wrappers. WKB-stored Vortex (and Parquet) read as `BLOB` and keep them. + fn query_for_format(&self, query: &str, format: Format) -> String { + match format { + Format::OnDiskVortex if self.native_points => strip_wkb_wrappers(query), + _ => query.to_string(), + } + } + + async fn generate_base_data(&self) -> anyhow::Result<()> { + if self.data_url.scheme() != "file" { + return Ok(()); + } + + let base_data_dir = self + .data_url + .to_file_path() + .map_err(|_| anyhow::anyhow!("Invalid file URL: {}", self.data_url.as_str()))?; + + datagen::generate_tables(&self.scale_factor, base_data_dir.clone()).await?; + + if self.native_points { + let parquet_dir = base_data_dir.join(Format::Parquet.name()); + let native_dir = base_data_dir.join(NATIVE_DIR); + let parquet_native_dir = base_data_dir.join(PARQUET_NATIVE_DIR); + // Natively encode every table with geometry columns (trip Point, building/zone Polygon). + // `zone` is sourced externally (`spatialbench-cli`), so only convert it once its parquet + // is present. 
+ let mut tables = vec![Table::Trip, Table::Building]; + if zone_parquet_present(&parquet_dir) { + tables.push(Table::Zone); + } + for table in tables { + datagen::write_native_vortex(table, &parquet_dir, &native_dir).await?; + datagen::write_native_parquet(table, &parquet_dir, &parquet_native_dir).await?; + } + } + Ok(()) + } + + fn format_path(&self, format: Format, base_url: &Url) -> anyhow::Result { + if self.native_points { + // points=native reads the native-geometry dirs (Vortex / GeoParquet); other formats + // would feed WKB to the stripped SQL, so bail. + let dir = match format { + Format::OnDiskVortex => NATIVE_DIR, + Format::Parquet => PARQUET_NATIVE_DIR, + other => anyhow::bail!( + "points=native only supports the vortex and parquet formats, got {other}" + ), + }; + return Ok(base_url.join(&format!("{dir}/"))?); + } + Ok(base_url.join(&format!("{}/", format.name()))?) + } + + fn expected_row_counts(&self) -> Option> { + // Q0 result count by scale factor (index 0), cross-checked against a brute-force WKB decode. + match self.scale_factor.as_str() { + "0.1" => Some(vec![6]), + "1.0" => Some(vec![94]), + "3.0" => Some(vec![267]), + _ => None, + } + } + + fn dataset(&self) -> BenchmarkDataset { + BenchmarkDataset::SpatialBench { + scale_factor: self.scale_factor.clone(), + native_points: self.native_points, + } + } + + fn dataset_name(&self) -> &str { + "spatialbench" + } + + fn dataset_display(&self) -> String { + format!("spatialbench(sf={})", self.scale_factor) + } + + fn data_url(&self) -> &Url { + &self.data_url + } + + fn table_specs(&self) -> Vec { + let mut specs = vec![TableSpec::new("trip", None), TableSpec::new("building", None)]; + // `zone` is externally sourced and optional; register it only when present so Q0/Q7 (which + // don't need it) don't fail on the missing glob. 
/// Byte index of the `)` that closes a parenthesis opened just before `s`.
///
/// Nested parentheses are balanced and anything inside single-quoted SQL literals is ignored
/// (a doubled `''` escape toggles the quote state twice, so it is handled for free).
/// Returns `None` when `s` ends before the closing parenthesis is found.
fn matching_paren(s: &str) -> Option<usize> {
    let mut depth = 0usize;
    let mut in_quote = false;
    // Scanning bytes is safe: `'`, `(` and `)` are ASCII and never occur inside a multi-byte
    // UTF-8 sequence, so the returned index is always a char boundary.
    for (idx, byte) in s.bytes().enumerate() {
        match byte {
            b'\'' => in_quote = !in_quote,
            _ if in_quote => {}
            b'(' => depth += 1,
            b')' if depth == 0 => return Some(idx),
            b')' => depth -= 1,
            _ => {}
        }
    }
    None
}

/// Drop each `ST_GeomFromWKB(expr)` wrapper down to `expr`: native columns are already
/// geometries.
///
/// The wrapper is closed at its *matching* parenthesis, so arguments that contain calls of
/// their own (`ST_GeomFromWKB(COALESCE(f(a), b))`) are unwrapped intact. Wrappers nested inside
/// the argument are stripped too. A wrapper whose parenthesis is never closed is left in place
/// and scanning continues after it.
fn strip_wkb_wrappers(sql: &str) -> String {
    const OPEN: &str = "ST_GeomFromWKB(";
    let mut out = String::with_capacity(sql.len());
    let mut rest = sql;
    while let Some(pos) = rest.find(OPEN) {
        out.push_str(&rest[..pos]);
        let after = &rest[pos + OPEN.len()..];
        match matching_paren(after) {
            Some(close) => {
                // The argument may itself contain wrappers; strip those as well.
                out.push_str(&strip_wkb_wrappers(&after[..close]));
                rest = &after[close + 1..];
            }
            // Unbalanced wrapper: keep it verbatim and carry on with the text after it.
            None => {
                out.push_str(OPEN);
                rest = after;
            }
        }
    }
    out.push_str(rest);
    out
}
+ +use std::path::Path; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::Context; +use arrow_array::RecordBatch; +use arrow_schema::DataType; +use arrow_schema::Schema; +use futures::TryStreamExt; +use geoarrow::array::GenericWkbArray; +use geoarrow::array::GeoArrowArray; +use geoarrow::array::WkbViewArray; +use geoarrow::datatypes::CoordType; +use geoarrow::datatypes::Crs; +use geoarrow::datatypes::Dimension; +use geoarrow::datatypes::GeoArrowType; +use geoarrow::datatypes::GeometryType; +use geoarrow::datatypes::Metadata; +use geoarrow::datatypes::PointType; +use geoarrow::datatypes::PolygonType; +use geoarrow::datatypes::WkbType; +use geoarrow_cast::cast::cast; +use parquet::arrow::AsyncArrowWriter; +use parquet::arrow::ParquetRecordBatchStreamBuilder; +use parquet::arrow::ProjectionMask; +use parquet::basic::Compression; +use parquet::file::properties::WriterProperties; +use tokio::fs::File as TokioFile; +use vortex::array::ArrayRef; +use vortex::array::IntoArray; +use vortex::array::arrays::ChunkedArray; +use vortex::array::arrow::ArrowSessionExt; +use vortex::file::WriteOptionsSessionExt; + +use super::table::GeometryKind; +use super::table::Table; +use crate::SESSION; +use crate::utils::file::idempotent_async; + +/// EPSG:4326, the CRS the benchmark data and queries assume. +fn epsg_4326() -> Arc { + Arc::new(Metadata::new( + Crs::from_unknown_crs_type("EPSG:4326".to_string()), + None, + )) +} + +/// Write `{native_dir}/{table}_0.vortex` with native geometry columns from the WKB parquet. Idempotent. 
+pub async fn write_native_vortex( + table: Table, + parquet_dir: &Path, + native_dir: &Path, +) -> anyhow::Result { + idempotent_async( + native_dir.join(format!("{}_0.vortex", table.name())), + |path| async move { + let chunks = map_source_batches(parquet_dir, table, |b| native_chunk(b, table)).await?; + + let dtype = chunks[0].dtype().clone(); + let chunked = ChunkedArray::try_new(chunks, dtype)?.into_array(); + let mut file = TokioFile::create(&path).await?; + SESSION + .write_options() + .write(&mut file, chunked.to_array_stream()) + .await?; + tracing::info!(path = %path.display(), table = table.name(), "wrote native geometry table"); + Ok(()) + }, + ) + .await +} + +/// Write `{out_dir}/{table}_0.parquet` with native GeoArrow geometry columns (separated XY, +/// `geoarrow.*` field metadata). Idempotent. +pub async fn write_native_parquet( + table: Table, + parquet_dir: &Path, + out_dir: &Path, +) -> anyhow::Result { + idempotent_async( + out_dir.join(format!("{}_0.parquet", table.name())), + |path| async move { + let batches = + map_source_batches(parquet_dir, table, |b| native_record_batch(b, table)).await?; + + let schema = batches.first().context("no batches to write")?.schema(); + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let mut writer = + AsyncArrowWriter::try_new(TokioFile::create(&path).await?, schema, Some(props))?; + for batch in &batches { + writer.write(batch).await?; + } + writer.close().await?; + tracing::info!(path = %path.display(), table = table.name(), "wrote native geometry parquet table"); + Ok(()) + }, + ) + .await +} + +/// Apply `f` to every batch read from `table`'s base WKB parquet parts, projected to its columns. 
+async fn map_source_batches( + parquet_dir: &Path, + table: Table, + mut f: impl FnMut(RecordBatch) -> anyhow::Result, +) -> anyhow::Result> { + let pattern = parquet_dir.join(format!("{}_*.parquet", table.name())); + let mut files: Vec = + glob::glob(&pattern.to_string_lossy())?.collect::>()?; + files.sort(); + anyhow::ensure!(!files.is_empty(), "no parquet matching {pattern:?}"); + + let mut out = Vec::new(); + for file in files { + let builder = ParquetRecordBatchStreamBuilder::new(TokioFile::open(&file).await?).await?; + let mask = + ProjectionMask::columns(builder.parquet_schema(), table.columns().iter().copied()); + let mut stream = builder.with_projection(mask).build()?; + while let Some(batch) = stream.try_next().await? { + out.push(f(batch)?); + } + } + Ok(out) +} + +/// Convert each of `table`'s WKB geometry columns to its native-lane representation, swapping the +/// column in so the field carries the matching `geoarrow.*` extension metadata. +fn native_record_batch(batch: RecordBatch, table: Table) -> anyhow::Result { + let schema = batch.schema(); + let mut fields = schema.fields().to_vec(); + let mut columns = batch.columns().to_vec(); + + for geom in table.geometry_columns() { + let idx = schema.index_of(geom.name)?; + let column = batch.column(idx).as_ref(); + let wkb_type = WkbType::new(epsg_4326()); + + // Wrap the source WKB bytes in the matching GeoArrow array. SpatialBench's own tables emit + // `Binary`; the externally-sourced `zone` parquet uses `BinaryView`. + let wkb: Box = match column.data_type() { + DataType::Binary => Box::new(GenericWkbArray::::try_from((column, wkb_type))?), + DataType::LargeBinary => { + Box::new(GenericWkbArray::::try_from((column, wkb_type))?) + } + DataType::BinaryView => Box::new(WkbViewArray::try_from((column, wkb_type))?), + other => anyhow::bail!("{}: unsupported WKB column type {other}", geom.name), + }; + + // Produce the native-lane array for this column's geometry kind. 
+ let native: Arc = match geom.kind { + // Homogeneous columns decode to a native, separated-XY GeoArrow type. + GeometryKind::Point => cast( + wkb.as_ref(), + &GeoArrowType::Point( + PointType::new(Dimension::XY, epsg_4326()) + .with_coord_type(CoordType::Separated), + ), + )?, + GeometryKind::Polygon => cast( + wkb.as_ref(), + &GeoArrowType::Polygon( + PolygonType::new(Dimension::XY, epsg_4326()) + .with_coord_type(CoordType::Separated), + ), + )?, + // A mixed Polygon/MultiPolygon column (Overture zones) has no native Vortex type, so it + // stays WKB — but re-encoded little-endian: Overture ships big-endian and DuckDB's direct + // GEOMETRY ingestion only accepts little-endian. Round-tripping through a mixed geometry + // array re-serializes it as little-endian WKB. + GeometryKind::Wkb => { + let mixed = cast( + wkb.as_ref(), + &GeoArrowType::Geometry(GeometryType::new(epsg_4326())), + )?; + cast( + mixed.as_ref(), + &GeoArrowType::Wkb(WkbType::new(epsg_4326())), + )? + } + }; + + columns[idx] = native.to_array_ref(); + fields[idx] = Arc::new(native.data_type().to_field(geom.name, false)); + } + + Ok(RecordBatch::try_new( + Arc::new(Schema::new(fields)), + columns, + )?) +} + +/// Convert a WKB batch to a Vortex struct chunk with `table`'s geometry columns as native types. +fn native_chunk(batch: RecordBatch, table: Table) -> anyhow::Result { + let native_batch = native_record_batch(batch, table)?; + let native_schema = native_batch.schema(); + SESSION + .arrow() + .from_arrow_record_batch(native_batch, &native_schema) + .context("importing native batch") +} diff --git a/vortex-bench/src/spatialbench/datagen/table.rs b/vortex-bench/src/spatialbench/datagen/table.rs new file mode 100644 index 00000000000..e769bf29146 --- /dev/null +++ b/vortex-bench/src/spatialbench/datagen/table.rs @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! 
The shared SpatialBench table catalog: one source of truth for the base tables, used by both +//! the WKB generation ([`super::wkb`]) and the native geometry conversion ([`super::native`]). + +/// A SpatialBench base table. +#[derive(Clone, Copy)] +pub enum Table { + Trip, + Building, + Zone, +} + +/// Base tables generated in-process from the scale factor. `Zone` is excluded — it is sourced externally. +pub(crate) const TABLES: &[Table] = &[Table::Trip, Table::Building]; + +/// A geometry column and the geometry type its WKB bytes decode to. +pub(crate) struct GeometryColumn { + pub(crate) name: &'static str, + pub(crate) kind: GeometryKind, +} + +/// Geometry types a column can hold. Add a variant (and the matching arm in [`super::native`]) as +/// tables with new geometry types are wired. +#[derive(Clone, Copy, Debug)] +pub(crate) enum GeometryKind { + Point, + Polygon, + /// No native vortex type yet — keep the WKB bytes as `WellKnownBinary` (still surfaces to DuckDB + /// as `GEOMETRY`). Used for `zone`, whose Overture boundaries include `MultiPolygon`. + Wkb, +} + +impl Table { + /// File stem under a format directory, e.g. `Trip` → `trip_{part}.parquet`. + pub(crate) fn name(self) -> &'static str { + match self { + Table::Trip => "trip", + Table::Building => "building", + Table::Zone => "zone", + } + } + + /// Columns the wired queries read — the projection applied when building native files. + pub(crate) fn columns(self) -> &'static [&'static str] { + match self { + Table::Trip => &[ + "t_tripkey", + "t_pickuptime", + "t_pickuploc", + "t_dropofftime", + "t_distance", + "t_fare", + ], + Table::Building => &["b_buildingkey", "b_name", "b_boundary"], + Table::Zone => &["z_zonekey", "z_name", "z_boundary"], + } + } + + /// Geometry columns to decode from WKB to native, with their geometry type. Empty for tables + /// only used on the WKB lane (DuckDB reads WKB directly; no native conversion needed yet). 
+ pub(crate) fn geometry_columns(self) -> &'static [GeometryColumn] { + match self { + Table::Trip => &[GeometryColumn { + name: "t_pickuploc", + kind: GeometryKind::Point, + }], + Table::Building => &[GeometryColumn { + name: "b_boundary", + kind: GeometryKind::Polygon, + }], + Table::Zone => &[GeometryColumn { + name: "z_boundary", + kind: GeometryKind::Wkb, + }], + } + } +} diff --git a/vortex-bench/src/spatialbench/datagen/wkb.rs b/vortex-bench/src/spatialbench/datagen/wkb.rs new file mode 100644 index 00000000000..8b4edb65d8d --- /dev/null +++ b/vortex-bench/src/spatialbench/datagen/wkb.rs @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! SpatialBench WKB base-table generation via the `spatialbench` crates (a tpchgen-rs fork). +//! Geometry is emitted as WKB; the native-Point encodings derive from these files in +//! [`super::native`]. + +use std::fs; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::Result; +// spatialbench emits arrow-56 batches, so they must be written with its matching arrow-56 +// parquet crate, not the workspace's arrow-58 one. The parquet file itself is version-neutral. +use spatialbench::generators::BuildingGenerator; +use spatialbench::generators::TripGenerator; +use spatialbench_arrow::BuildingArrow; +use spatialbench_arrow::RecordBatchIterator; +use spatialbench_arrow::TripArrow; +use spatialbench_parquet::arrow::AsyncArrowWriter; +use spatialbench_parquet::basic::Compression; +use spatialbench_parquet::file::properties::WriterProperties; +use tokio::fs::File as TokioFile; +use tracing::info; + +use super::table::TABLES; +use super::table::Table; +use crate::Format; +use crate::utils::file::idempotent_async; + +/// Batch size matching the TPC-H generator's streaming batches. +const BATCH_SIZE: usize = 8192 * 64; + +/// Batch iterator for one partition of `table`, from the arrow-56 `spatialbench` crates. 
+fn iterator( + table: Table, + scale_factor: f64, + part: i32, + part_count: i32, +) -> Box { + match table { + Table::Trip => Box::new( + TripArrow::new(TripGenerator::new(scale_factor, part, part_count)) + .with_batch_size(BATCH_SIZE), + ), + Table::Building => Box::new( + BuildingArrow::new(BuildingGenerator::new(scale_factor, part, part_count)) + .with_batch_size(BATCH_SIZE), + ), + // Zone is sourced externally (the published `spatialbench` crate has no generator); it is + // never emitted by `generate_tables`, which only iterates `TABLES`. + Table::Zone => unreachable!("zone is sourced externally, not generated in-process"), + } +} + +/// Generate the SpatialBench base tables as parquet under `{output_dir}/parquet/`. +pub async fn generate_tables(scale_factor: &str, output_dir: PathBuf) -> Result<()> { + let scale_factor = scale_factor.parse::()?; + let parquet_dir = output_dir.join(Format::Parquet.name()); + fs::create_dir_all(&parquet_dir)?; + + // One part per unit of scale factor keeps each file near the ~350MB the trip generator + // produces at SF1. 
+ #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let num_parts = (scale_factor.ceil() as usize).max(1); + let part_count = i32::try_from(num_parts)?; + + for &table in TABLES { + for part_idx in 0..num_parts { + let output_file = parquet_dir.join(format!("{}_{part_idx}.parquet", table.name())); + let part = i32::try_from(part_idx + 1)?; + + idempotent_async(output_file.to_string_lossy().as_ref(), |path| async move { + info!( + scale_factor, + part, + part_count, + table = table.name(), + "Generating SpatialBench table" + ); + + let iter = iterator(table, scale_factor, part, part_count); + let schema = Arc::clone(iter.schema()); + + let file = TokioFile::create(&path).await?; + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let mut writer = AsyncArrowWriter::try_new(file, schema, Some(props))?; + for batch in iter { + writer.write(&batch).await?; + } + writer.close().await?; + + Ok::<(), anyhow::Error>(()) + }) + .await?; + } + } + + Ok(()) +} diff --git a/vortex-bench/src/spatialbench/mod.rs b/vortex-bench/src/spatialbench/mod.rs new file mode 100644 index 00000000000..bba06bd7ef9 --- /dev/null +++ b/vortex-bench/src/spatialbench/mod.rs @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! SpatialBench geospatial analytics benchmark. +//! +//! See . + +pub mod benchmark; +pub mod datagen; + +pub use benchmark::SpatialBenchBenchmark; diff --git a/vortex-bench/src/v3.rs b/vortex-bench/src/v3.rs index 48e8a7f1c94..7ac1d72365b 100644 --- a/vortex-bench/src/v3.rs +++ b/vortex-bench/src/v3.rs @@ -294,6 +294,7 @@ fn canonical_tpc_scale_factor(scale_factor: &str) -> String { /// | `GhArchive` | `gharchive` | `None` | `None` | | /// | `Appian` | `appian` | `None` | `None` | Static dataset; no scale factor. | /// | `PublicBi { name }` | `public-bi` | dataset name (e.g. 
`cms-provider`) | `None` | Sub-dataset name lives in `dataset_variant`. | +/// | `SpatialBench { scale_factor, native_points }` | `spatialbench` | `points-native` when native, else `None` | SF as string | Same canonicalization as TPC-H; no historical v2 records to merge with. | pub fn benchmark_dataset_dims(d: &BenchmarkDataset) -> (String, Option, Option) { match d { BenchmarkDataset::TpcH { scale_factor } => ( @@ -318,6 +319,14 @@ pub fn benchmark_dataset_dims(d: &BenchmarkDataset) -> (String, Option, // live). Drop it to keep live ingests merging into the migrated // group. The dataset-level `n_rows` is recoverable from the bench // matrix if ever needed. + BenchmarkDataset::SpatialBench { + scale_factor, + native_points, + } => ( + "spatialbench".to_string(), + native_points.then(|| "points-native".to_string()), + Some(canonical_tpc_scale_factor(scale_factor)), + ), BenchmarkDataset::StatPopGen { .. } => ("statpopgen".to_string(), None, None), BenchmarkDataset::PolarSignals { .. } => ("polarsignals".to_string(), None, None), BenchmarkDataset::Fineweb => ("fineweb".to_string(), None, None), diff --git a/vortex-duckdb/src/convert/dtype.rs b/vortex-duckdb/src/convert/dtype.rs index 4238b354182..b2bb9a5e962 100644 --- a/vortex-duckdb/src/convert/dtype.rs +++ b/vortex-duckdb/src/convert/dtype.rs @@ -58,6 +58,8 @@ use vortex::extension::datetime::Time; use vortex::extension::datetime::TimeUnit; use vortex::extension::datetime::Timestamp; use vortex_geo::extension::GeoMetadata; +use vortex_geo::extension::Point; +use vortex_geo::extension::Polygon; use vortex_geo::extension::WellKnownBinary; use crate::cpp::DUCKDB_TYPE; @@ -245,9 +247,14 @@ impl TryFrom<&DType> for LogicalType { return temporal_to_duckdb(temporal); } - if let Some(wkb) = ext_dtype.metadata_opt::() { - let crs = wkb.crs.as_ref(); - return LogicalType::geometry_type(crs.map(|crs| crs.as_str())); + // Native Point/Polygon and WKB all surface to DuckDB as GEOMETRY so `ST_*` bind; for + // native geometry 
the filter work then pushes down into the Vortex scan. + if let Some(geo) = ext_dtype + .metadata_opt::() + .or_else(|| ext_dtype.metadata_opt::()) + .or_else(|| ext_dtype.metadata_opt::()) + { + return LogicalType::geometry_type(geo.crs.as_deref()); } vortex_bail!("Unsupported extension type \"{}\"", ext_dtype.id()); diff --git a/vortex-duckdb/src/convert/expr.rs b/vortex-duckdb/src/convert/expr.rs index 324086e5775..1f04d670ac6 100644 --- a/vortex-duckdb/src/convert/expr.rs +++ b/vortex-duckdb/src/convert/expr.rs @@ -27,6 +27,7 @@ use vortex::expr::not; use vortex::expr::or_collect; use vortex::expr::root; use vortex::scalar::Scalar; +use vortex::scalar_fn::EmptyOptions; use vortex::scalar_fn::ScalarFnVTableExt; use vortex::scalar_fn::fns::between::Between; use vortex::scalar_fn::fns::between::BetweenOptions; @@ -36,6 +37,10 @@ use vortex::scalar_fn::fns::like::Like; use vortex::scalar_fn::fns::like::LikeOptions; use vortex::scalar_fn::fns::literal::Literal; use vortex::scalar_fn::fns::operators::Operator; +use vortex_geo::extension::WellKnownBinary; +use vortex_geo::extension::point_2d_scalar; +use vortex_geo::scalar_fn::distance::GeoDistance; +use vortex_geo::scalar_fn::intersects::GeoIntersects; use crate::cpp::DUCKDB_VX_EXPR_TYPE; use crate::duckdb; @@ -57,11 +62,86 @@ fn from_bound_str(value: &duckdb::ExpressionRef) -> VortexResult { } } +/// Read an `f64` from a constant expression (e.g. an `ST_Point` coordinate literal). +fn from_bound_f64(value: &duckdb::ExpressionRef) -> VortexResult { + match value.as_class().vortex_expect("unknown class") { + BoundConstant(constant) => f64::try_from(&Scalar::try_from(constant.value)?), + _ => vortex_bail!("Expected f64 constant, got {:?}", value.as_class_id()), + } +} + +/// Convert an `ST_Distance` operand to a native geometry expression. A folded `ST_Point(..)` +/// constant arrives as WKB `GEOMETRY`; decode it once at plan time to a native `Point`, no per-row WKB. 
/// Read `(x, y)` from a bare 2D WKB Point.
///
/// Layout: one byte-order byte (`0` = big-endian, `1` = little-endian), a `u32` geometry type
/// that must equal `1`, then two `f64` coordinates — 21 bytes in total.
///
/// Returns `None` when the buffer is shorter than 21 bytes, when the byte-order byte is
/// neither `0` nor `1`, or when the geometry type is not `1` (SRID/Z/M flags and non-Point
/// types change the type code, and with it the fixed offsets read here).
fn wkb_2d_point_xy(bytes: &[u8]) -> Option<(f64, f64)> {
    // byte order + geometry type + x + y
    const POINT_LEN: usize = 1 + 4 + 8 + 8;
    if bytes.len() < POINT_LEN {
        return None;
    }
    let little_endian = match bytes[0] {
        0 => false,
        1 => true,
        // Not a valid WKB byte-order marker: decoding the payload as big-endian anyway would
        // fabricate coordinates, so refuse it.
        _ => return None,
    };
    // The length guard above makes every fixed-offset slice below in-bounds.
    let read_u32 = |offset: usize| -> u32 {
        let mut chunk = [0u8; 4];
        chunk.copy_from_slice(&bytes[offset..offset + 4]);
        if little_endian {
            u32::from_le_bytes(chunk)
        } else {
            u32::from_be_bytes(chunk)
        }
    };
    let read_f64 = |offset: usize| -> f64 {
        let mut chunk = [0u8; 8];
        chunk.copy_from_slice(&bytes[offset..offset + 8]);
        if little_endian {
            f64::from_le_bytes(chunk)
        } else {
            f64::from_be_bytes(chunk)
        }
    };
    // Geometry-type code 1 == bare 2D Point; anything else shifts the coordinate offsets.
    if read_u32(1) != 1 {
        return None;
    }
    Some((read_f64(5), read_f64(13)))
}
} + coord @ ("st_x" | "st_y") => { + vortex_ensure!(children.len() == 1); + let Some(child) = try_from_expression_inner(children[0], col_sub)? else { + return Ok(None); + }; + // "st_x" -> "x", "st_y" -> "y" + get_item(&coord[3..], child) + } + _ => return Ok(None), }; Ok(Some(expr)) } +/// Whether `name` is a geo UDF that `try_from_geo_function` lowers — shared with +/// `can_push_expression` so the pushable and lowered sets can't drift. Case-insensitive since +/// DuckDB keeps the registered case (e.g. `ST_Distance`). +fn is_geo_function(name: &str) -> bool { + matches!( + name.to_ascii_lowercase().as_str(), + "st_distance" | "st_intersects" | "st_point" | "st_x" | "st_y" + ) +} + pub fn try_from_bound_expression( value: &duckdb::ExpressionRef, ) -> VortexResult> { @@ -166,13 +304,17 @@ pub fn can_push_expression(value: &duckdb::ExpressionRef) -> bool { BoundConjunction(conj) => conj.children().all(can_push_expression), ExpressionClass::BoundFunction(func) => { let name = func.scalar_function.name(); - name == "struct_extract" - || name == "contains" - || name == "prefix" - || name == "suffix" - || name == "~~" - || name == "!~~" - || name == "strlen" + // A geo UDF is pushable when all its operands are; `try_from_geo_function` lowers it. + // Built-in names are always lowercase; geo UDFs keep their registered case. 
+ match name { + "struct_extract" | "contains" | "prefix" | "suffix" | "~~" | "!~~" | "strlen" => { + true + } + _ if is_geo_function(name) => { + matches!(try_from_geo_function(name, &func, None), Ok(Some(_))) + } + _ => false, + } } ExpressionClass::BoundOperator(op) => { if !matches!( diff --git a/vortex-duckdb/src/exporter/extension.rs b/vortex-duckdb/src/exporter/extension.rs index 221dc92a85f..07be6d00dc0 100644 --- a/vortex-duckdb/src/exporter/extension.rs +++ b/vortex-duckdb/src/exporter/extension.rs @@ -8,6 +8,10 @@ use vortex::array::arrays::extension::ExtensionArrayExt; use vortex::array::extension::datetime::AnyTemporal; use vortex::error::VortexResult; use vortex::error::vortex_bail; +use vortex_geo::extension::Point; +use vortex_geo::extension::PointData; +use vortex_geo::extension::Polygon; +use vortex_geo::extension::PolygonData; use vortex_geo::extension::WellKnownBinary; use vortex_geo::extension::WellKnownBinaryData; @@ -27,5 +31,13 @@ pub(crate) fn new_exporter( return geo::new_wkb_exporter(WellKnownBinaryData::try_from(ext)?, ctx); } + if ext.ext_dtype().is::() { + return geo::new_point_exporter(PointData::try_from(ext)?, ctx); + } + + if ext.ext_dtype().is::() { + return geo::new_polygon_exporter(PolygonData::try_from(ext)?, ctx); + } + vortex_bail!("no non-temporal extension exporter") } diff --git a/vortex-duckdb/src/exporter/geo.rs b/vortex-duckdb/src/exporter/geo.rs index 1287ed019e2..6137c56fd5a 100644 --- a/vortex-duckdb/src/exporter/geo.rs +++ b/vortex-duckdb/src/exporter/geo.rs @@ -4,6 +4,8 @@ use vortex::array::ExecutionCtx; use vortex::array::arrays::VarBinViewArray; use vortex::error::VortexResult; +use vortex_geo::extension::PointData; +use vortex_geo::extension::PolygonData; use vortex_geo::extension::WellKnownBinaryData; use crate::exporter::ColumnExporter; @@ -17,3 +19,24 @@ pub(crate) fn new_wkb_exporter( let values = array.wkb_values().clone().execute::(ctx)?; new_exporter(values, ctx) } + +/// Create an exporter for a native 
`Point` column. DuckDB `GEOMETRY` vectors carry WKB, so the +/// points are serialized to WKB via [`PointData::to_wkb`] (only for rows DuckDB materializes — +/// with predicate pushdown that's just the survivors). +pub(crate) fn new_point_exporter( + point: PointData, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + let values = point.to_wkb(ctx)?.execute::(ctx)?; + new_exporter(values, ctx) +} + +/// Create an exporter for a native `Polygon` column. DuckDB `GEOMETRY` vectors carry WKB, so the +/// polygons are serialized to WKB via [`PolygonData::to_wkb`] (only for rows DuckDB materializes). +pub(crate) fn new_polygon_exporter( + polygon: PolygonData, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + let values = polygon.to_wkb(ctx)?.execute::(ctx)?; + new_exporter(values, ctx) +} diff --git a/vortex-geo/Cargo.toml b/vortex-geo/Cargo.toml index e2f7e4dc10f..068831b6a62 100644 --- a/vortex-geo/Cargo.toml +++ b/vortex-geo/Cargo.toml @@ -20,6 +20,7 @@ geo = { workspace = true } geo-traits = { workspace = true } geo-types = { workspace = true } geoarrow = { workspace = true } +geoarrow-cast = { workspace = true } prost = { workspace = true } vortex-array = { workspace = true } vortex-error = { workspace = true } @@ -28,6 +29,7 @@ wkb = { workspace = true } [dev-dependencies] rstest = { workspace = true } +vortex-layout = { workspace = true } [lints] workspace = true diff --git a/vortex-geo/src/aggregate_fn/bounds.rs b/vortex-geo/src/aggregate_fn/bounds.rs new file mode 100644 index 00000000000..1d31f9e3ff1 --- /dev/null +++ b/vortex-geo/src/aggregate_fn/bounds.rs @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! An aggregate computing the minimum bounding rectangle (2D) of a native +//! geometry column as `Struct`. Stored as a zone statistic, it lets spatial +//! filters prune chunks whose bounding box cannot intersect the query region. 
+ +use vortex_array::ArrayRef; +use vortex_array::Columnar; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::aggregate_fn::AggregateFnId; +use vortex_array::aggregate_fn::AggregateFnRef; +use vortex_array::aggregate_fn::AggregateFnVTable; +use vortex_array::aggregate_fn::AggregateFnVTableExt; +use vortex_array::aggregate_fn::EmptyOptions; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::struct_::StructArrayExt; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::dtype::StructFields; +use vortex_array::scalar::Scalar; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_session::VortexSession; + +use crate::extension::coordinates; +use crate::extension::is_native_geometry; + +/// Aggregate computing the minimum bounding rectangle of a native geometry column, as +/// `Struct` of `f64`. +#[derive(Clone, Debug)] +pub struct GeometryBounds; + +/// An axis-aligned bounding box: four `f64`s, all this aggregate needs to min/max coordinates. +#[derive(Clone, Copy)] +struct Bbox { + xmin: f64, + ymin: f64, + xmax: f64, + ymax: f64, +} + +impl Bbox { + /// The smallest box containing both `self` and `other`. + fn union(self, other: Bbox) -> Bbox { + Bbox { + xmin: self.xmin.min(other.xmin), + ymin: self.ymin.min(other.ymin), + xmax: self.xmax.max(other.xmax), + ymax: self.ymax.max(other.ymax), + } + } +} + +/// Partial MBR accumulator: the union of every bounding box seen so far, or `None` when empty. +pub struct BoundsPartial { + bbox: Option, +} + +impl BoundsPartial { + fn merge(&mut self, other: Bbox) { + self.bbox = Some(match self.bbox { + Some(cur) => cur.union(other), + None => other, + }); + } +} + +/// `Struct` of `f64`, nullable so an empty group yields a null MBR. 
The +/// coordinate fields are themselves nullable so that extracting one from the nullable struct (as the +/// pruning proof does) keeps a consistent nullable dtype. +fn bounds_dtype() -> DType { + let coord = DType::Primitive(PType::F64, Nullability::Nullable); + DType::Struct( + StructFields::from_iter([ + ("xmin", coord.clone()), + ("ymin", coord.clone()), + ("xmax", coord.clone()), + ("ymax", coord), + ]), + Nullability::Nullable, + ) +} + +/// The bounding box of the coordinate slices, or `None` for an empty chunk. +fn bounds_of(xs: &[f64], ys: &[f64]) -> Option { + if xs.is_empty() { + return None; + } + let min_max = |vals: &[f64]| { + vals.iter() + .fold((f64::INFINITY, f64::NEG_INFINITY), |(lo, hi), &v| { + (lo.min(v), hi.max(v)) + }) + }; + let (xmin, xmax) = min_max(xs); + let (ymin, ymax) = min_max(ys); + Some(Bbox { + xmin, + ymin, + xmax, + ymax, + }) +} + +impl AggregateFnVTable for GeometryBounds { + type Options = EmptyOptions; + type Partial = BoundsPartial; + + fn id(&self) -> AggregateFnId { + AggregateFnId::new("vortex.geo.bounds") + } + + // Serializable so the zoned writer can persist this as a per-chunk stat. No options to encode. + fn serialize(&self, _options: &Self::Options) -> VortexResult>> { + Ok(Some(vec![])) + } + + fn deserialize(&self, _metadata: &[u8], _session: &VortexSession) -> VortexResult { + Ok(EmptyOptions) + } + + fn return_dtype(&self, _options: &Self::Options, input_dtype: &DType) -> Option { + is_native_geometry(input_dtype).then(bounds_dtype) + } + + fn zone_stat_default(&self, input_dtype: &DType) -> Option { + // Geometry columns get a per-chunk bounding box for pruning. 
+ is_native_geometry(input_dtype).then(|| self.bind(EmptyOptions)) + } + + fn partial_dtype(&self, options: &Self::Options, input_dtype: &DType) -> Option { + self.return_dtype(options, input_dtype) + } + + fn empty_partial( + &self, + _options: &Self::Options, + _input_dtype: &DType, + ) -> VortexResult { + Ok(BoundsPartial { bbox: None }) + } + + fn combine_partials(&self, partial: &mut Self::Partial, other: Scalar) -> VortexResult<()> { + if other.is_null() { + return Ok(()); + } + let fields = other.as_struct(); + let read = |name: &str| -> VortexResult { + f64::try_from( + &fields + .field(name) + .ok_or_else(|| vortex_err!("bounds missing {name}"))?, + ) + }; + partial.merge(Bbox { + xmin: read("xmin")?, + ymin: read("ymin")?, + xmax: read("xmax")?, + ymax: read("ymax")?, + }); + Ok(()) + } + + fn to_scalar(&self, partial: &Self::Partial) -> VortexResult { + Ok(match partial.bbox { + Some(b) => Scalar::struct_( + bounds_dtype(), + vec![ + Scalar::primitive(b.xmin, Nullability::Nullable), + Scalar::primitive(b.ymin, Nullability::Nullable), + Scalar::primitive(b.xmax, Nullability::Nullable), + Scalar::primitive(b.ymax, Nullability::Nullable), + ], + ), + None => Scalar::null(bounds_dtype()), + }) + } + + fn reset(&self, partial: &mut Self::Partial) { + partial.bbox = None; + } + + fn is_saturated(&self, _partial: &Self::Partial) -> bool { + // A bounding box can always grow, so it is never saturated. + false + } + + fn accumulate( + &self, + partial: &mut Self::Partial, + batch: &Columnar, + ctx: &mut ExecutionCtx, + ) -> VortexResult<()> { + let array = match batch { + Columnar::Canonical(canonical) => canonical.clone().into_array(), + Columnar::Constant(constant) => constant.clone().into_array(), + }; + let coords = coordinates(&array, ctx)?; + let xs = coords + .unmasked_field_by_name("x")? + .clone() + .execute::(ctx)?; + let ys = coords + .unmasked_field_by_name("y")? 
+ .clone() + .execute::(ctx)?; + if let Some(bbox) = bounds_of(xs.as_slice::(), ys.as_slice::()) { + partial.merge(bbox); + } + Ok(()) + } + + fn finalize(&self, partials: ArrayRef) -> VortexResult { + // The stored partial is already the MBR struct, so finalizing is the identity. + Ok(partials) + } + + fn finalize_scalar(&self, partial: &Self::Partial) -> VortexResult { + self.to_scalar(partial) + } +} + +#[cfg(test)] +mod tests { + use vortex_array::VortexSessionExecute; + use vortex_array::aggregate_fn::Accumulator; + use vortex_array::aggregate_fn::AggregateFnVTable; + use vortex_array::aggregate_fn::DynAccumulator; + use vortex_array::aggregate_fn::EmptyOptions; + use vortex_array::scalar::Scalar; + use vortex_error::VortexResult; + use vortex_error::vortex_err; + + use super::GeometryBounds; + use crate::test_harness::point_column; + use crate::test_harness::polygon_column; + + /// The aggregate must be serializable so the zoned writer can persist its zone-stat descriptor. + #[test] + fn serializes_for_zone_storage() -> VortexResult<()> { + let session = vortex_array::array_session(); + let metadata = GeometryBounds + .serialize(&EmptyOptions)? + .expect("GeometryBounds must be serializable to be stored as a zone statistic"); + GeometryBounds.deserialize(&metadata, &session)?; + Ok(()) + } + + /// The MBR result's corners as `(xmin, ymin, xmax, ymax)`. + fn mbr(result: &Scalar) -> VortexResult<(f64, f64, f64, f64)> { + let fields = result.as_struct(); + let read = |name: &str| -> VortexResult { + f64::try_from( + &fields + .field(name) + .ok_or_else(|| vortex_err!("missing {name}"))?, + ) + }; + Ok((read("xmin")?, read("ymin")?, read("xmax")?, read("ymax")?)) + } + + /// The MBR of a Point column is the min/max of its coordinates, accumulated across batches. 
+ #[test] + fn point_bounds_across_batches() -> VortexResult<()> { + let session = vortex_array::array_session(); + let mut ctx = session.create_execution_ctx(); + + let dtype = point_column(vec![0.0], vec![0.0])?.dtype().clone(); + let mut acc = Accumulator::try_new(GeometryBounds, EmptyOptions, dtype)?; + + acc.accumulate(&point_column(vec![1.0, 3.0], vec![2.0, 4.0])?, &mut ctx)?; + acc.accumulate(&point_column(vec![-1.0], vec![5.0])?, &mut ctx)?; + + assert_eq!(mbr(&acc.finish()?)?, (-1.0, 2.0, 3.0, 5.0)); + Ok(()) + } + + /// The MBR of a Polygon column is the min/max over every ring vertex of every polygon — + /// exercising the `List>` unwrap, not just the bare Point struct. + #[test] + fn polygon_bounds_union_all_vertices() -> VortexResult<()> { + let session = vortex_array::array_session(); + let mut ctx = session.create_execution_ctx(); + + // Two rectangles: (0,0)-(2,3) and (5,5)-(7,8). The chunk MBR is their union: (0,0)-(7,8). + let polygons = polygon_column(vec![ + vec![vec![(0.0, 0.0), (2.0, 0.0), (2.0, 3.0), (0.0, 3.0)]], + vec![vec![(5.0, 5.0), (7.0, 5.0), (7.0, 8.0), (5.0, 8.0)]], + ])?; + let dtype = polygons.dtype().clone(); + let mut acc = Accumulator::try_new(GeometryBounds, EmptyOptions, dtype)?; + acc.accumulate(&polygons, &mut ctx)?; + + assert_eq!(mbr(&acc.finish()?)?, (0.0, 0.0, 7.0, 8.0)); + Ok(()) + } + + /// An empty group yields a null MBR. + #[test] + fn empty_group_is_null() -> VortexResult<()> { + let dtype = point_column(vec![0.0], vec![0.0])?.dtype().clone(); + let mut acc = Accumulator::try_new(GeometryBounds, EmptyOptions, dtype)?; + assert!(acc.finish()?.is_null()); + Ok(()) + } + + /// After `initialize`, the registry yields a default zone statistic for geometry columns (so the + /// zoned writer stores it) but none for ordinary numeric columns. 
+ #[test] + fn registered_as_geometry_zone_default() -> VortexResult<()> { + use vortex_array::aggregate_fn::session::AggregateFnSessionExt; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + use vortex_array::dtype::PType; + + let session = vortex_array::array_session(); + crate::initialize(&session); + + let point_dtype = point_column(vec![0.0], vec![0.0])?.dtype().clone(); + assert!( + !session + .aggregate_fns() + .zone_stat_defaults(&point_dtype) + .is_empty(), + "a geometry zone-stat default should be discovered for Point columns" + ); + let i32_dtype = DType::Primitive(PType::I32, Nullability::NonNullable); + assert!( + session + .aggregate_fns() + .zone_stat_defaults(&i32_dtype) + .is_empty(), + "no geometry zone-stat default should apply to numeric columns" + ); + Ok(()) + } +} diff --git a/vortex-geo/src/aggregate_fn/mod.rs b/vortex-geo/src/aggregate_fn/mod.rs new file mode 100644 index 00000000000..22866cbfc06 --- /dev/null +++ b/vortex-geo/src/aggregate_fn/mod.rs @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Aggregate functions over geometry columns, for use as zone statistics. 
+ +mod bounds; + +pub use bounds::*; diff --git a/vortex-geo/src/extension/mod.rs b/vortex-geo/src/extension/mod.rs index 684c83bade0..c785fa33ffa 100644 --- a/vortex-geo/src/extension/mod.rs +++ b/vortex-geo/src/extension/mod.rs @@ -9,6 +9,7 @@ mod wkb; use std::fmt::Display; use std::sync::Arc; +use geo_traits::to_geo::ToGeoGeometry; use geo_types::Geometry; use geoarrow::datatypes::Crs; use geoarrow::datatypes::Metadata; @@ -19,13 +20,44 @@ use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::StructArray; use vortex_array::arrays::extension::ExtensionArrayExt; +use vortex_array::dtype::DType; use vortex_array::scalar::Scalar; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; pub use wkb::*; +/// Whether `dtype` is a native geometry extension. +pub(crate) fn is_native_geometry(dtype: &DType) -> bool { + dtype + .as_extension_opt() + .is_some_and(|ext| ext.is::() || ext.is::()) +} + +/// The flat coordinate `Struct` of a native geometry column. +pub(crate) fn coordinates(array: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let Some(ext) = array.dtype().as_extension_opt() else { + vortex_bail!( + "geo: operand is not a geometry extension type, was {}", + array.dtype() + ); + }; + let storage = array + .clone() + .execute::(ctx)? + .storage_array() + .clone(); + if ext.is::() { + point_coordinates(&storage, ctx) + } else if ext.is::() { + polygon_coordinates(&storage, ctx) + } else { + vortex_bail!("geo: unsupported geometry extension {}", array.dtype()) + } +} + /// Decode a native geometry column to `geo_types`. A non-geometry operand is an error. 
pub(crate) fn geometries( array: &ArrayRef, @@ -46,11 +78,29 @@ pub(crate) fn geometries( point_geometries(&storage, ctx) } else if ext.is::() { polygon_geometries(&storage, ctx) + } else if ext.is::() { + wkb_geometries(&storage, ctx) } else { vortex_bail!("geo: unsupported geometry extension {}", array.dtype()) } } +/// Decode WKB storage to `geo_types` via the `wkb` crate. Used when a geometry operand arrives as a +/// `WellKnownBinary` constant (e.g. a folded `ST_GeomFromText` polygon literal pushed down from +/// DuckDB) rather than a native column. +fn wkb_geometries(storage: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult>> { + (0..storage.len()) + .map(|i| { + let scalar = storage.clone().execute_scalar(i, ctx)?; + let binary = scalar.as_binary(); + let bytes = binary + .value() + .ok_or_else(|| vortex_err!("geo: null geometry is not supported"))?; + Ok(Wkb::try_from_bytes(bytes.as_slice())?.to_geometry()) + }) + .collect() +} + /// Decode a constant operand scalar to one geo geometry, a constant of any /// supported geometry type is decoded exactly like a column. 
pub(crate) fn single_geometry( diff --git a/vortex-geo/src/extension/point.rs b/vortex-geo/src/extension/point.rs index 19e33c212f5..76aab95d333 100644 --- a/vortex-geo/src/extension/point.rs +++ b/vortex-geo/src/extension/point.rs @@ -12,16 +12,21 @@ use arrow_schema::Field; use arrow_schema::extension::ExtensionType; use geo_traits::to_geo::ToGeoGeometry; use geo_types::Geometry; +use geoarrow::array::GeoArrowArray; use geoarrow::array::GeoArrowArrayAccessor; use geoarrow::array::IntoArrow; use geoarrow::array::PointArray; use geoarrow::datatypes::CoordType; +use geoarrow::datatypes::GeoArrowType; use geoarrow::datatypes::PointType; +use geoarrow::datatypes::WkbType; +use geoarrow_cast::cast::cast; use prost::Message; use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::StructArray; use vortex_array::arrays::extension::ExtensionArrayExt; use vortex_array::arrow::ArrowExport; use vortex_array::arrow::ArrowExportVTable; @@ -31,12 +36,14 @@ use vortex_array::arrow::ArrowSession; use vortex_array::arrow::ArrowSessionExt; use vortex_array::arrow::FromArrowArray; use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; use vortex_array::dtype::arrow::FromArrowType; use vortex_array::dtype::extension::ExtDType; use vortex_array::dtype::extension::ExtId; use vortex_array::dtype::extension::ExtVTable; use vortex_array::scalar::Scalar; use vortex_array::scalar::ScalarValue; +use vortex_error::VortexError; use vortex_error::VortexResult; use vortex_error::vortex_ensure; use vortex_error::vortex_err; @@ -96,20 +103,32 @@ fn point_type(geo_metadata: &GeoMetadata, dimension: Dimension) -> PointType { PointType::new(dimension.into(), geoarrow_metadata(geo_metadata)) } -/// Decode `Point` storage to `geo_types` points, for the geo scalar functions. -pub(crate) fn point_geometries( +/// The coordinate `Struct` of `Point` storage. 
+pub(crate) fn point_coordinates( storage: &ArrayRef, ctx: &mut ExecutionCtx, -) -> VortexResult>> { +) -> VortexResult { + storage.clone().execute::(ctx) +} + +/// Build the GeoArrow [`PointArray`] from `Point` storage — shared by geometry decode and WKB export. +fn point_array(storage: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { let point_type = point_type( &GeoMetadata::default(), coordinate_dimension(storage.dtype())?, ); let session = ctx.session().clone(); let arrow = session.arrow().execute_arrow(storage.clone(), None, ctx)?; - let points = PointArray::try_from((arrow.as_ref(), point_type)) - .map_err(|e| vortex_err!("failed to construct PointArray: {e}"))?; - points + PointArray::try_from((arrow.as_ref(), point_type)) + .map_err(|e| vortex_err!("failed to construct PointArray: {e}")) +} + +/// Decode `Point` storage to `geo_types` points, for the geo scalar functions. +pub(crate) fn point_geometries( + storage: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult>> { + point_array(storage, ctx)? .iter() .map(|geometry| -> VortexResult> { Ok(geometry @@ -120,6 +139,45 @@ pub(crate) fn point_geometries( .collect() } +/// A validated `Point` array (`try_from` checks the extension type) — the entry point for WKB export. +pub struct PointData(ExtensionArray); + +impl TryFrom for PointData { + type Error = VortexError; + + fn try_from(ext: ExtensionArray) -> Result { + vortex_ensure!( + ext.ext_dtype().is::(), + "expected a Point extension array" + ); + Ok(PointData(ext)) + } +} + +impl PointData { + /// Serialize points to WKB (a view array) via geoarrow's cast — the form DuckDB `GEOMETRY` takes. 
+ pub fn to_wkb(&self, ctx: &mut ExecutionCtx) -> VortexResult { + let points = point_array(&self.0.storage_array().clone(), ctx)?; + let wkb_type = + GeoArrowType::WkbView(WkbType::new(geoarrow_metadata(&GeoMetadata::default()))); + let wkb = cast(&points, &wkb_type) + .map_err(|e| vortex_err!("failed to cast points to WKB: {e}"))?; + ArrayRef::from_arrow(wkb.to_array_ref().as_ref(), false) + } +} + +/// A constant 2D `Point` scalar at `(x, y)`, no CRS (`2d`: builds only the `Xy` dimension). Lowers a +/// folded `ST_Point(x, y)` literal to a native operand for pushed-down geo functions. +pub fn point_2d_scalar(x: f64, y: f64) -> VortexResult { + let storage_dtype = coordinate_storage_dtype(Dimension::Xy, Nullability::NonNullable); + let storage = Scalar::struct_( + storage_dtype.clone(), + vec![Scalar::from(x), Scalar::from(y)], + ); + let ext = ExtDType::try_with_vtable(Point, GeoMetadata::default(), storage_dtype)?.erased(); + Scalar::try_new(DType::Extension(ext), storage.into_value()) +} + impl ArrowExportVTable for Point { fn arrow_ext_id(&self) -> Id { *ARROW_POINT diff --git a/vortex-geo/src/extension/polygon.rs b/vortex-geo/src/extension/polygon.rs index fc06ce59bd3..114a66241c2 100644 --- a/vortex-geo/src/extension/polygon.rs +++ b/vortex-geo/src/extension/polygon.rs @@ -13,17 +13,24 @@ use arrow_schema::Field; use arrow_schema::extension::ExtensionType; use geo_traits::to_geo::ToGeoGeometry; use geo_types::Geometry; +use geoarrow::array::GeoArrowArray; use geoarrow::array::GeoArrowArrayAccessor; use geoarrow::array::IntoArrow; use geoarrow::array::PolygonArray; use geoarrow::datatypes::CoordType; +use geoarrow::datatypes::GeoArrowType; use geoarrow::datatypes::PolygonType; +use geoarrow::datatypes::WkbType; +use geoarrow_cast::cast::cast; use prost::Message; use vortex_array::ArrayRef; +use vortex_array::Canonical; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::ExtensionArray; +use 
vortex_array::arrays::StructArray; use vortex_array::arrays::extension::ExtensionArrayExt; +use vortex_array::arrays::listview::ListViewArrayExt; use vortex_array::arrow::ArrowExport; use vortex_array::arrow::ArrowExportVTable; use vortex_array::arrow::ArrowImport; @@ -38,6 +45,7 @@ use vortex_array::dtype::extension::ExtDType; use vortex_array::dtype::extension::ExtId; use vortex_array::dtype::extension::ExtVTable; use vortex_array::scalar::ScalarValue; +use vortex_error::VortexError; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; @@ -111,18 +119,42 @@ fn polygon_type(geo_metadata: &GeoMetadata, dimension: Dimension) -> PolygonType PolygonType::new(dimension.into(), geoarrow_metadata(geo_metadata)) } -/// Decode `Polygon` storage (`List>`) to `geo_types` polygons, for the geo scalar -/// functions. CRS does not affect planar geometry ops, so default metadata is used. -pub(crate) fn polygon_geometries( +/// The coordinate `Struct` of `Polygon` storage, flattening both `List` levels of +/// `List>` to every vertex of every ring. +pub(crate) fn polygon_coordinates( storage: &ArrayRef, ctx: &mut ExecutionCtx, -) -> VortexResult>> { +) -> VortexResult { + // Peel the outer ring list, then each ring's coordinate list, leaving the coordinate struct. + let rings = storage + .clone() + .execute::(ctx)? + .into_listview() + .elements() + .clone(); + let coords = rings + .execute::(ctx)? + .into_listview() + .elements() + .clone(); + coords.execute::(ctx) +} + +/// Build the GeoArrow [`PolygonArray`] from `Polygon` storage — shared by geometry decode and WKB export. 
+fn polygon_array(storage: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { let polygon_type = polygon_type(&GeoMetadata::default(), polygon_dimension(storage.dtype())?); let session = ctx.session().clone(); let arrow = session.arrow().execute_arrow(storage.clone(), None, ctx)?; - let polygons = PolygonArray::try_from((arrow.as_ref(), polygon_type)) - .map_err(|e| vortex_err!("failed to construct PolygonArray: {e}"))?; - polygons + PolygonArray::try_from((arrow.as_ref(), polygon_type)) + .map_err(|e| vortex_err!("failed to construct PolygonArray: {e}")) +} + +/// Decode `Polygon` storage to `geo_types` polygons, for the geo scalar functions. +pub(crate) fn polygon_geometries( + storage: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult>> { + polygon_array(storage, ctx)? .iter() .map(|geometry| -> VortexResult> { Ok(geometry @@ -133,6 +165,33 @@ pub(crate) fn polygon_geometries( .collect() } +/// A validated `Polygon` array (`try_from` checks the extension type) — the entry point for WKB export. +pub struct PolygonData(ExtensionArray); + +impl TryFrom for PolygonData { + type Error = VortexError; + + fn try_from(ext: ExtensionArray) -> Result { + vortex_ensure!( + ext.ext_dtype().is::(), + "expected a Polygon extension array" + ); + Ok(PolygonData(ext)) + } +} + +impl PolygonData { + /// Serialize polygons to WKB (a view array) via geoarrow's cast — the form DuckDB `GEOMETRY` takes. 
+ pub fn to_wkb(&self, ctx: &mut ExecutionCtx) -> VortexResult { + let polygons = polygon_array(&self.0.storage_array().clone(), ctx)?; + let wkb_type = + GeoArrowType::WkbView(WkbType::new(geoarrow_metadata(&GeoMetadata::default()))); + let wkb = cast(&polygons, &wkb_type) + .map_err(|e| vortex_err!("failed to cast polygons to WKB: {e}"))?; + ArrayRef::from_arrow(wkb.to_array_ref().as_ref(), false) + } +} + impl ArrowExportVTable for Polygon { fn arrow_ext_id(&self) -> Id { *ARROW_POLYGON diff --git a/vortex-geo/src/lib.rs b/vortex-geo/src/lib.rs index 951d93b7b4f..d73ec4f60d1 100644 --- a/vortex-geo/src/lib.rs +++ b/vortex-geo/src/lib.rs @@ -3,17 +3,24 @@ use std::sync::Arc; +use vortex_array::aggregate_fn::session::AggregateFnSessionExt; use vortex_array::arrow::ArrowSessionExt; use vortex_array::dtype::session::DTypeSessionExt; use vortex_array::scalar_fn::session::ScalarFnSessionExt; +use vortex_array::stats::session::StatsSessionExt; use vortex_session::VortexSession; +use crate::aggregate_fn::GeometryBounds; use crate::extension::Point; use crate::extension::Polygon; use crate::extension::WellKnownBinary; +use crate::prune::GeoDistanceBoundsPrune; use crate::scalar_fn::distance::GeoDistance; +use crate::scalar_fn::intersects::GeoIntersects; +pub mod aggregate_fn; pub mod extension; +pub mod prune; pub mod scalar_fn; #[cfg(test)] mod test_harness; @@ -35,4 +42,11 @@ pub fn initialize(session: &VortexSession) { // Register the geometry scalar functions. session.scalar_fns().register(GeoDistance); + session.scalar_fns().register(GeoIntersects); + + // The bounding-box aggregate; self-declares as a per-chunk zone stat for geometry columns. + session.aggregate_fns().register(GeometryBounds); + + // Register the spatial pruning rule that uses that bounding box. 
+ session.stats().register_rewrite(GeoDistanceBoundsPrune); } diff --git a/vortex-geo/src/prune.rs b/vortex-geo/src/prune.rs new file mode 100644 index 00000000000..8a2604e19b7 --- /dev/null +++ b/vortex-geo/src/prune.rs @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Stats-rewrite pruning for spatial filters, backed by the per-chunk [`GeometryBounds`] MBR. +//! +//! [`GeoDistanceBoundsPrune`] falsifies `ST_Distance(geom, const) <= r` (or `< r`): any row within +//! `r` of the constant must lie inside the constant's bounding box expanded by `r`, so if a chunk's +//! geometry MBR is disjoint from that expanded box, no row in the chunk can match and the chunk is +//! skipped. +//! +//! # Limitations +//! +//! Only the "near" forms `<= r` / `< r` are handled — the predicates a radius/within search uses. +//! They prune via the MBR's *lower* distance bound (the nearest point of the box to `const`). Every +//! other comparison falls through to `None`, leaving the chunk to be scanned — correct, just not +//! pruned: +//! +//! - `> r` / `>= r` are *soundly* prunable via the symmetric upper bound (the farthest corner of the +//! MBR being `<= r` proves every geometry is within `r`), but are intentionally omitted: "far from" +//! filters are rare and rarely selective, so the prune would almost never fire. +//! - `== r` would need both bounds at once and is not a realistic query. +//! - `!= r` is unprunable: a bounding box cannot prove every row sits at exactly distance `r`. 
+ +use geo::BoundingRect; +use vortex_array::VortexSessionExecute; +use vortex_array::aggregate_fn::AggregateFnVTableExt; +use vortex_array::aggregate_fn::EmptyOptions; +use vortex_array::expr::Expression; +use vortex_array::expr::get_item; +use vortex_array::expr::gt; +use vortex_array::expr::is_root; +use vortex_array::expr::lit; +use vortex_array::expr::lt; +use vortex_array::expr::or; +use vortex_array::scalar_fn::ScalarFnId; +use vortex_array::scalar_fn::ScalarFnVTable; +use vortex_array::scalar_fn::fns::binary::Binary; +use vortex_array::scalar_fn::fns::literal::Literal; +use vortex_array::scalar_fn::fns::operators::Operator; +use vortex_array::stats::rewrite::StatsRewriteCtx; +use vortex_array::stats::rewrite::StatsRewriteRule; +use vortex_array::stats::stat; +use vortex_error::VortexResult; + +use crate::aggregate_fn::GeometryBounds; +use crate::extension::single_geometry; +use crate::scalar_fn::distance::GeoDistance; + +/// Prunes chunks for `GeoDistance(geom, const) <= r` / `< r` using the chunk's [`GeometryBounds`] +/// MBR. Registered against the comparison's scalar-function id, since the comparison — not +/// `GeoDistance` — is the predicate root. +#[derive(Debug)] +pub struct GeoDistanceBoundsPrune; + +impl StatsRewriteRule for GeoDistanceBoundsPrune { + fn scalar_fn_id(&self) -> ScalarFnId { + Binary.id() + } + + fn falsify( + &self, + expr: &Expression, + ctx: &StatsRewriteCtx<'_>, + ) -> VortexResult> { + // Only the "near" forms `<= r` / `< r` are pruned; every other comparison is left to the + // scan (see the module-level Limitations for why). + match expr.as_::() { + Operator::Lte | Operator::Lt => {} + _ => return Ok(None), + } + let distance = expr.child(0); + let threshold = expr.child(1); + + // The left operand must be `GeoDistance(geom, const)`. 
+ if distance.as_opt::().is_none() { + return Ok(None); + } + + // Identify the geometry column (the scope root) and the constant geometry operand; distance + // is symmetric, so the constant may be on either side. + let (lhs, rhs) = (distance.child(0), distance.child(1)); + let (geom, constant) = if is_root(lhs) { + (lhs, rhs) + } else if is_root(rhs) { + (rhs, lhs) + } else { + return Ok(None); + }; + + let (Some(constant), Some(radius)) = + (constant.as_opt::(), threshold.as_opt::()) + else { + return Ok(None); + }; + let Ok(radius) = f64::try_from(radius) else { + return Ok(None); + }; + + // Bounding box of the constant geometry, expanded by the radius. + let mut exec = ctx.session().create_execution_ctx(); + let Some(rect) = single_geometry(constant, &mut exec)?.bounding_rect() else { + return Ok(None); + }; + let (xmin, xmax) = (rect.min().x - radius, rect.max().x + radius); + let (ymin, ymax) = (rect.min().y - radius, rect.max().y + radius); + + // Chunk MBR disjoint from the expanded box (on any axis), if no row can match then prune. 
+ let mbr = stat(geom.clone(), GeometryBounds.bind(EmptyOptions)); + let proof = or( + or( + lt(get_item("xmax", mbr.clone()), lit(xmin)), + gt(get_item("xmin", mbr.clone()), lit(xmax)), + ), + or( + lt(get_item("ymax", mbr.clone()), lit(ymin)), + gt(get_item("ymin", mbr), lit(ymax)), + ), + ); + Ok(Some(proof)) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use rstest::rstest; + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::aggregate_fn::AggregateFnVTableExt; + use vortex_array::aggregate_fn::EmptyOptions as AggregateEmptyOptions; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::StructArray; + use vortex_array::expr::Expression; + use vortex_array::expr::lit; + use vortex_array::expr::lt_eq; + use vortex_array::expr::root; + use vortex_array::scalar_fn::EmptyOptions; + use vortex_array::scalar_fn::ScalarFnVTableExt; + use vortex_array::scalar_fn::fns::binary::Binary; + use vortex_array::scalar_fn::fns::operators::Operator; + use vortex_array::stats::rewrite::StatsRewriteCtx; + use vortex_array::stats::rewrite::StatsRewriteRule; + use vortex_array::validity::Validity; + use vortex_error::VortexResult; + use vortex_layout::layouts::zoned::zone_map::ZoneMap; + + use super::GeoDistanceBoundsPrune; + use crate::aggregate_fn::GeometryBounds; + use crate::scalar_fn::distance::GeoDistance; + use crate::test_harness::point_column; + + /// Run the rule against `GeoDistance(root, origin) 0.5` (operands swapped when + /// `geom_first` is false). Returns the falsity proof, if any. 
+ fn falsify_distance(operator: Operator, geom_first: bool) -> VortexResult> { + let session = vortex_array::array_session(); + crate::initialize(&session); + let mut ctx = session.create_execution_ctx(); + + let scope = point_column(vec![0.0], vec![0.0])?.dtype().clone(); + let origin = point_column(vec![0.0], vec![0.0])?.execute_scalar(0, &mut ctx)?; + let operands = if geom_first { + [root(), lit(origin)] + } else { + [lit(origin), root()] + }; + let distance = GeoDistance.new_expr(EmptyOptions, operands); + let predicate = Binary.new_expr(operator, [distance, lit(0.5f64)]); + + GeoDistanceBoundsPrune.falsify(&predicate, &StatsRewriteCtx::new(&session, &scope)) + } + + /// Only the upper-bounded "near" forms (`<=`/`<`) are pruned; the rest are left to the scan. + #[rstest] + #[case(Operator::Lte, true)] + #[case(Operator::Lt, true)] + #[case(Operator::Gt, false)] + #[case(Operator::Gte, false)] + #[case(Operator::Eq, false)] + #[case(Operator::NotEq, false)] + fn prunes_only_near_distance( + #[case] operator: Operator, + #[case] prunes: bool, + ) -> VortexResult<()> { + assert_eq!(falsify_distance(operator, true)?.is_some(), prunes); + Ok(()) + } + + /// Distance is symmetric: `GeoDistance(const, geom) <= r` falsifies just like the geom-first form. + #[test] + fn falsifies_with_constant_as_left_operand() -> VortexResult<()> { + assert!(falsify_distance(Operator::Lte, false)?.is_some()); + Ok(()) + } + + /// A comparison that does not wrap `GeoDistance` is left untouched. 
+ #[test] + fn ignores_non_distance_comparison() -> VortexResult<()> { + let session = vortex_array::array_session(); + crate::initialize(&session); + let scope = point_column(vec![0.0], vec![0.0])?.dtype().clone(); + + let predicate = lt_eq(lit(1.0f64), lit(2.0f64)); + let ctx = StatsRewriteCtx::new(&session, &scope); + assert!(GeoDistanceBoundsPrune.falsify(&predicate, &ctx)?.is_none()); + Ok(()) + } + + /// `falsify` to `ZoneMap::prune` over a hand-built zone map: the far chunk is skipped, the near + /// one kept. + #[test] + fn prunes_far_chunk_keeps_near() -> VortexResult<()> { + let session = vortex_array::array_session(); + crate::initialize(&session); + let mut ctx = session.create_execution_ctx(); + + let point_dtype = point_column(vec![0.0], vec![0.0])?.dtype().clone(); + let bounds_fn = GeometryBounds.bind(AggregateEmptyOptions); + + // Two chunks: chunk 0 near the origin (MBR 0,0..1,1), chunk 1 far away (MBR 100,100..101,101). + let coord = + |a: f64, b: f64| PrimitiveArray::from_option_iter([Some(a), Some(b)]).into_array(); + let mbrs = StructArray::try_new( + ["xmin", "ymin", "xmax", "ymax"].into(), + vec![ + coord(0.0, 100.0), + coord(0.0, 100.0), + coord(1.0, 101.0), + coord(1.0, 101.0), + ], + 2, + Validity::AllValid, + )? + .into_array(); + let zone_array = StructArray::from_fields(&[(bounds_fn.to_string().as_str(), mbrs)])?; + let zone_map = + ZoneMap::try_new(point_dtype.clone(), zone_array, Arc::new([bounds_fn]), 1, 2)?; + + let origin = point_column(vec![0.0], vec![0.0])?.execute_scalar(0, &mut ctx)?; + let distance = GeoDistance.new_expr(EmptyOptions, [root(), lit(origin)]); + let predicate = lt_eq(distance, lit(0.5f64)); + let proof = predicate + .falsify(&point_dtype, &session)? + .expect("distance filter should be falsifiable"); + + // `true` means the zone is pruned: chunk 0 (near origin) is kept, chunk 1 (far) is skipped. 
+ let mask = zone_map.prune(&proof, &session)?; + assert_eq!(mask.iter().collect::>(), vec![false, true]); + Ok(()) + } +} diff --git a/vortex-geo/src/scalar_fn/distance.rs b/vortex-geo/src/scalar_fn/distance.rs index feb7ea833aa..929112ee194 100644 --- a/vortex-geo/src/scalar_fn/distance.rs +++ b/vortex-geo/src/scalar_fn/distance.rs @@ -10,8 +10,12 @@ use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::Constant; use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::ExtensionArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::ScalarFnArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::extension::ExtensionArrayExt; +use vortex_array::arrays::struct_::StructArrayExt; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::dtype::PType; @@ -27,6 +31,8 @@ use vortex_error::VortexResult; use vortex_error::vortex_ensure; use vortex_session::VortexSession; +use crate::extension::Point; +use crate::extension::coordinate::coordinate_from_struct; use crate::extension::geometries; use crate::extension::single_geometry; @@ -99,14 +105,29 @@ impl ScalarFnVTable for GeoDistance { (Some(query), None) => distances_to_constant(&b, query.scalar(), ctx), (None, Some(query)) => distances_to_constant(&a, query.scalar(), ctx), (None, None) => { - let ag = geometries(&a, ctx)?; - let bg = geometries(&b, ctx)?; vortex_ensure!( - ag.len() == bg.len(), + a.len() == b.len(), "geo distance: operand length mismatch {} vs {}", - ag.len(), - bg.len() + a.len(), + b.len() ); + // Fast path: two Point columns — distance over their `x`/`y` f64 buffers directly. 
+ if is_point(a.dtype()) + && !a.dtype().is_nullable() + && is_point(b.dtype()) + && !b.dtype().is_nullable() + { + let (xa, ya) = point_xy(&a, ctx)?; + let (xb, yb) = point_xy(&b, ctx)?; + return Ok(point_distances( + xa.as_slice::().iter().copied(), + ya.as_slice::().iter().copied(), + xb.as_slice::().iter().copied(), + yb.as_slice::().iter().copied(), + )); + } + let ag = geometries(&a, ctx)?; + let bg = geometries(&b, ctx)?; let distances = ag.iter().zip(&bg).map(|(x, y)| Euclidean.distance(x, y)); Ok(PrimitiveArray::from_iter(distances).into_array()) } @@ -121,12 +142,70 @@ fn distances_to_constant( query: &Scalar, ctx: &mut ExecutionCtx, ) -> VortexResult { + // Fast path: Point column to a constant Point — distance over the column's `x`/`y` f64 buffers, + // broadcasting the constant, skipping the per-row `geo::Geometry` the general path materializes. + // (Polygons / other geometries fall through.) + if is_point(operand.dtype()) && !operand.dtype().is_nullable() && is_point(query.dtype()) { + let q = coordinate_from_struct(&query.as_extension().to_storage_scalar())?; + let (xs, ys) = point_xy(operand, ctx)?; + return Ok(point_distances( + xs.as_slice::().iter().copied(), + ys.as_slice::().iter().copied(), + std::iter::repeat(q.x), + std::iter::repeat(q.y), + )); + } + let query = single_geometry(query, ctx)?; let geoms = geometries(operand, ctx)?; let distances = geoms.iter().map(|g| Euclidean.distance(g, &query)); Ok(PrimitiveArray::from_iter(distances).into_array()) } +/// Extract the `x` and `y` `f64` columns from a native `Point` operand, for the columnar fast paths. +fn point_xy( + operand: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult<(PrimitiveArray, PrimitiveArray)> { + let storage = operand + .clone() + .execute::(ctx)? + .storage_array() + .clone() + .execute::(ctx)?; + let xs = storage + .unmasked_field_by_name("x")? + .clone() + .execute::(ctx)?; + let ys = storage + .unmasked_field_by_name("y")? 
+ .clone() + .execute::(ctx)?; + Ok((xs, ys)) +} + +/// Per-row planar distance `sqrt(dx² + dy²)` between two streams of `(x, y)` f64 coordinates. Shared +/// by the point fast paths; a constant operand feeds its side as `repeat(c)` (zero-alloc broadcast). +fn point_distances( + xa: impl Iterator, + ya: impl Iterator, + xb: impl Iterator, + yb: impl Iterator, +) -> ArrayRef { + let distances = xa.zip(ya).zip(xb.zip(yb)).map(|((xa, ya), (xb, yb))| { + let (dx, dy) = (xa - xb, ya - yb); + (dx * dx + dy * dy).sqrt() + }); + PrimitiveArray::from_iter(distances).into_array() +} + +/// Whether `dtype` is the native `Point` extension (eligible for the columnar fast path). +fn is_point(dtype: &DType) -> bool { + dtype + .as_extension_opt() + .is_some_and(|ext| ext.is::()) +} + #[cfg(test)] mod tests { use vortex_array::ArrayRef; diff --git a/vortex-geo/src/scalar_fn/intersects.rs b/vortex-geo/src/scalar_fn/intersects.rs new file mode 100644 index 00000000000..58e87152daf --- /dev/null +++ b/vortex-geo/src/scalar_fn/intersects.rs @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! `ST_Intersects`: whether two native geometries intersect. 
+ +use geo::Intersects; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::Constant; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::ScalarFnArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::Arity; +use vortex_array::scalar_fn::ChildName; +use vortex_array::scalar_fn::EmptyOptions; +use vortex_array::scalar_fn::ExecutionArgs; +use vortex_array::scalar_fn::ScalarFnId; +use vortex_array::scalar_fn::ScalarFnVTable; +use vortex_array::scalar_fn::TypedScalarFnInstance; +use vortex_error::VortexResult; +use vortex_error::vortex_ensure; +use vortex_session::VortexSession; + +use crate::extension::geometries; +use crate::extension::single_geometry; + +/// `ST_Intersects` between two native geometry operands, computed by the `geo` crate. +#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)] +pub struct GeoIntersects; + +impl GeoIntersects { + /// A lazy `ScalarFnArray` computing the per-row intersection predicate of `a` and `b`; either + /// may be constant. 
+ pub fn try_new_array(a: ArrayRef, b: ArrayRef) -> VortexResult { + ScalarFnArray::try_new( + TypedScalarFnInstance::new(GeoIntersects, EmptyOptions).erased(), + vec![a, b], + ) + } +} + +impl ScalarFnVTable for GeoIntersects { + type Options = EmptyOptions; + + fn id(&self) -> ScalarFnId { + ScalarFnId::new("vortex.geo.intersects") + } + + fn serialize(&self, _: &Self::Options) -> VortexResult>> { + Ok(Some(vec![])) + } + + fn deserialize(&self, _: &[u8], _: &VortexSession) -> VortexResult { + Ok(EmptyOptions) + } + + fn arity(&self, _: &Self::Options) -> Arity { + Arity::Exact(2) + } + + fn child_name(&self, _: &Self::Options, child_idx: usize) -> ChildName { + match child_idx { + 0 => ChildName::from("a"), + 1 => ChildName::from("b"), + _ => unreachable!("intersects has exactly two children"), + } + } + + fn return_dtype(&self, _: &Self::Options, _: &[DType]) -> VortexResult { + Ok(DType::Bool(Nullability::NonNullable)) + } + + fn execute( + &self, + _: &Self::Options, + args: &dyn ExecutionArgs, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + let a = args.get(0)?; + let b = args.get(1)?; + match (a.as_opt::(), b.as_opt::()) { + (Some(qa), Some(qb)) => { + let ga = single_geometry(qa.scalar(), ctx)?; + let gb = single_geometry(qb.scalar(), ctx)?; + Ok(ConstantArray::new( + Scalar::bool(ga.intersects(&gb), Nullability::NonNullable), + a.len(), + ) + .into_array()) + } + (Some(query), None) => intersects_constant(&b, query.scalar(), ctx), + (None, Some(query)) => intersects_constant(&a, query.scalar(), ctx), + (None, None) => { + let ag = geometries(&a, ctx)?; + let bg = geometries(&b, ctx)?; + vortex_ensure!( + ag.len() == bg.len(), + "geo intersects: operand length mismatch {} vs {}", + ag.len(), + bg.len() + ); + let bits = ag.iter().zip(&bg).map(|(x, y)| x.intersects(y)); + Ok(BoolArray::from_iter(bits).into_array()) + } + } + } +} + +/// Intersection of each row of `operand` with a constant `query` geometry, decoded once and +/// broadcast. 
Intersection is symmetric, so this serves a constant on either side. +fn intersects_constant( + operand: &ArrayRef, + query: &Scalar, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let query = single_geometry(query, ctx)?; + let geoms = geometries(operand, ctx)?; + let bits = geoms.iter().map(|g| g.intersects(&query)); + Ok(BoolArray::from_iter(bits).into_array()) +} + +#[cfg(test)] +mod tests { + use geo_types::Coord; + use geo_types::Geometry; + use geo_types::LineString; + use geo_types::Polygon; + use vortex_array::ArrayRef; + use vortex_array::Canonical; + use vortex_array::ExecutionCtx; + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::ConstantArray; + use vortex_error::VortexResult; + + use super::GeoIntersects; + use crate::test_harness::point_column; + use crate::test_harness::polygon_column; + use crate::test_harness::wkb_geometry_scalar; + + /// Execute a `GeoIntersects` array and read back its per-row booleans. + fn intersections(array: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult> { + Ok(array + .execute::(ctx)? + .into_bool() + .into_bit_buffer() + .iter() + .collect()) + } + + /// `ST_Intersects(point, polygon)` is point-in-polygon: a point inside the polygon intersects + /// it, a point outside does not. + #[test] + fn point_in_polygon() -> VortexResult<()> { + let session = vortex_array::array_session(); + let mut ctx = session.create_execution_ctx(); + + // A 4x4 square anchored at the origin, as a constant query polygon. + let square = polygon_column(vec![vec![vec![ + (0.0, 0.0), + (4.0, 0.0), + (4.0, 4.0), + (0.0, 4.0), + (0.0, 0.0), + ]]])?; + let square = ConstantArray::new(square.execute_scalar(0, &mut ctx)?, 2).into_array(); + + // (2,2) is inside the square; (5,5) is outside. 
+ let points = point_column(vec![2.0, 5.0], vec![2.0, 5.0])?; + let result = GeoIntersects::try_new_array(points, square)?.into_array(); + + assert_eq!(intersections(result, &mut ctx)?, vec![true, false]); + Ok(()) + } + + /// The Q1 pushdown path: the polygon arrives as a `WellKnownBinary` constant (a folded geometry + /// literal), decoded to `geo_types` via the `wkb` crate in `geometries`. + #[test] + fn point_in_polygon_wkb_constant() -> VortexResult<()> { + let session = vortex_array::array_session(); + let mut ctx = session.create_execution_ctx(); + + // The same 4x4 square, but as a WKB literal rather than a native polygon. + let square = Geometry::Polygon(Polygon::new( + LineString::new(vec![ + Coord { x: 0.0, y: 0.0 }, + Coord { x: 4.0, y: 0.0 }, + Coord { x: 4.0, y: 4.0 }, + Coord { x: 0.0, y: 4.0 }, + Coord { x: 0.0, y: 0.0 }, + ]), + vec![], + )); + let square = ConstantArray::new(wkb_geometry_scalar(&square)?, 2).into_array(); + + let points = point_column(vec![2.0, 5.0], vec![2.0, 5.0])?; + let result = GeoIntersects::try_new_array(points, square)?.into_array(); + + assert_eq!(intersections(result, &mut ctx)?, vec![true, false]); + Ok(()) + } +} diff --git a/vortex-geo/src/scalar_fn/mod.rs b/vortex-geo/src/scalar_fn/mod.rs index 385208f1991..5a7880f8ae7 100644 --- a/vortex-geo/src/scalar_fn/mod.rs +++ b/vortex-geo/src/scalar_fn/mod.rs @@ -4,3 +4,4 @@ //! Geometry scalar functions over the [`Point`](crate::extension::Point) type. pub mod distance; +pub mod intersects; diff --git a/vortex-geo/src/test_harness.rs b/vortex-geo/src/test_harness.rs index 2e9e7f43c27..88b52d405aa 100644 --- a/vortex-geo/src/test_harness.rs +++ b/vortex-geo/src/test_harness.rs @@ -3,34 +3,105 @@ //! Shared test helpers for the geospatial extension types. 
+use geo_types::Geometry; use vortex_array::ArrayRef; use vortex_array::IntoArray; use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::ListArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::StructArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; use vortex_array::dtype::extension::ExtDType; use vortex_array::scalar::Scalar; +use vortex_array::validity::Validity; use vortex_error::VortexResult; +use vortex_error::vortex_err; use crate::extension::GeoMetadata; use crate::extension::Point; +use crate::extension::Polygon; +use crate::extension::WellKnownBinary; use crate::extension::coordinate::Coordinate; +use crate::extension::coordinate::Dimension; use crate::extension::coordinate::coordinate_from_struct; +use crate::extension::polygon_storage_dtype; -/// A `Point` column (CRS `EPSG:4326`) over the given x/y coordinates. -pub(crate) fn point_column(xs: Vec, ys: Vec) -> VortexResult { - let storage = StructArray::from_fields(&[ +/// The WGS 84 (`EPSG:4326`) metadata tagged onto test geometry columns. +fn wgs84() -> GeoMetadata { + GeoMetadata { + crs: Some("EPSG:4326".to_string()), + } +} + +/// A coordinate `Struct` over the parallel x/y buffers. +fn xy_struct(xs: Vec, ys: Vec) -> VortexResult { + Ok(StructArray::from_fields(&[ ("x", PrimitiveArray::from_iter(xs).into_array()), ("y", PrimitiveArray::from_iter(ys).into_array()), ])? - .into_array(); - let metadata = GeoMetadata { - crs: Some("EPSG:4326".to_string()), - }; - let dtype = ExtDType::::try_new(metadata, storage.dtype().clone())?; + .into_array()) +} + +/// A `Point` column (CRS `EPSG:4326`) over the given x/y coordinates. +pub(crate) fn point_column(xs: Vec, ys: Vec) -> VortexResult { + let storage = xy_struct(xs, ys)?; + let dtype = ExtDType::::try_new(wgs84(), storage.dtype().clone())?; Ok(ExtensionArray::new(dtype.erased(), storage).into_array()) } +/// A `Polygon` column (CRS `EPSG:4326`). 
Each polygon is a list of rings; each ring a list of +/// `(x, y)` vertices. Stored as `List>>`. +pub(crate) fn polygon_column(polygons: Vec>>) -> VortexResult { + let offset = |n: usize| i32::try_from(n).map_err(|_| vortex_err!("polygon offset overflow")); + + let (mut xs, mut ys) = (Vec::new(), Vec::new()); + let mut ring_offsets = vec![0i32]; + let mut polygon_offsets = vec![0i32]; + for rings in &polygons { + for ring in rings { + for &(x, y) in ring { + xs.push(x); + ys.push(y); + } + ring_offsets.push(offset(xs.len())?); + } + polygon_offsets.push(offset(ring_offsets.len() - 1)?); + } + + let rings = ListArray::try_new( + xy_struct(xs, ys)?, + PrimitiveArray::from_iter(ring_offsets).into_array(), + Validity::NonNullable, + )? + .into_array(); + let storage = ListArray::try_new( + rings, + PrimitiveArray::from_iter(polygon_offsets).into_array(), + Validity::NonNullable, + )? + .into_array(); + + let dtype = ExtDType::::try_new( + wgs84(), + polygon_storage_dtype(Dimension::Xy, Nullability::NonNullable), + )?; + Ok(ExtensionArray::try_new(dtype.erased(), storage)?.into_array()) +} + +/// A `WellKnownBinary` (WKB) geometry scalar (CRS `EPSG:4326`) — the form a folded geometry literal +/// arrives as when pushed down from DuckDB. +pub(crate) fn wkb_geometry_scalar(geometry: &Geometry) -> VortexResult { + let mut bytes = Vec::new(); + wkb::writer::write_geometry(&mut bytes, geometry, &wkb::writer::WriteOptions::default()) + .map_err(|e| vortex_err!("writing WKB failed: {e}"))?; + let dtype = ExtDType::::try_new(wgs84(), DType::Binary(Nullability::NonNullable))?; + Ok(Scalar::extension_ref( + dtype.erased(), + Scalar::binary(bytes, Nullability::NonNullable), + )) +} + /// Decode a [`Coordinate`] from an extension-typed point scalar (unwrapped to its coordinate /// storage) or a bare coordinate `Struct` scalar — used to read back a single point in assertions. 
pub(crate) fn coordinate_from_scalar(scalar: &Scalar) -> VortexResult { diff --git a/vortex-layout/src/layouts/zoned/writer.rs b/vortex-layout/src/layouts/zoned/writer.rs index 7ebca0104e9..317961f0826 100644 --- a/vortex-layout/src/layouts/zoned/writer.rs +++ b/vortex-layout/src/layouts/zoned/writer.rs @@ -25,6 +25,7 @@ use vortex_array::aggregate_fn::fns::min::Min; use vortex_array::aggregate_fn::fns::nan_count::NanCount; use vortex_array::aggregate_fn::fns::null_count::NullCount; use vortex_array::aggregate_fn::fns::sum::Sum; +use vortex_array::aggregate_fn::session::AggregateFnSessionExt; use vortex_array::dtype::DType; use vortex_error::VortexResult; use vortex_error::vortex_ensure; @@ -106,7 +107,7 @@ impl LayoutStrategy for ZonedStrategy { .options .aggregate_fns .clone() - .unwrap_or_else(|| default_zoned_aggregate_fns(stream.dtype())); + .unwrap_or_else(|| default_zoned_aggregate_fns(stream.dtype(), session)); let session = session.clone(); let stats_accumulator = Arc::new(Mutex::new(AggregateStatsAccumulator::new( @@ -178,7 +179,7 @@ impl LayoutStrategy for ZonedStrategy { } } -fn default_zoned_aggregate_fns(dtype: &DType) -> Arc<[AggregateFnRef]> { +fn default_zoned_aggregate_fns(dtype: &DType, session: &VortexSession) -> Arc<[AggregateFnRef]> { let (max, min) = match dtype { DType::Utf8(_) | DType::Binary(_) => ( BoundedMax.bind(BoundedMaxOptions { @@ -204,6 +205,10 @@ fn default_zoned_aggregate_fns(dtype: &DType) -> Arc<[AggregateFnRef]> { aggregate_fns.push(NanCount.bind(EmptyOptions)); aggregate_fns.push(NullCount.bind(EmptyOptions)); + // The builtin stats above are named directly. Stats from extension crates this one can't depend + // on (e.g. a geometry bounding box) are discovered from the registry at runtime instead. 
+ aggregate_fns.extend(session.aggregate_fns().zone_stat_defaults(dtype)); + aggregate_fns.into() } @@ -223,7 +228,10 @@ mod tests { #[test] fn default_aggregates_bound_variable_length_min_max() { - let aggregate_fns = default_zoned_aggregate_fns(&DType::Utf8(Nullability::NonNullable)); + let aggregate_fns = default_zoned_aggregate_fns( + &DType::Utf8(Nullability::NonNullable), + &vortex_array::array_session(), + ); assert_eq!( aggregate_fns[0].as_::().max_bytes, @@ -237,7 +245,8 @@ mod tests { #[test] fn default_aggregates_keep_fixed_width_min_max_exact() { - let aggregate_fns = default_zoned_aggregate_fns(&PType::I32.into()); + let aggregate_fns = + default_zoned_aggregate_fns(&PType::I32.into(), &vortex_array::array_session()); assert!(aggregate_fns[0].is::()); assert!(aggregate_fns[1].is::()); @@ -249,7 +258,7 @@ mod tests { let dtype = DType::Extension( Timestamp::new(TimeUnit::Microseconds, Nullability::Nullable).erased(), ); - let aggregate_fns = default_zoned_aggregate_fns(&dtype); + let aggregate_fns = default_zoned_aggregate_fns(&dtype, &vortex_array::array_session()); assert!( aggregate_fns