fix remote storage support for deltalake (#331)

This commit is contained in:
QP Hou 2024-05-19 20:56:46 -07:00 committed by GitHub
parent 8168d93859
commit 1938fe6c2b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 34 additions and 11 deletions

View File

@@ -3,14 +3,23 @@
set -eux
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
curl --fail-with-body https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o $HOME/minio-binaries/mc
chmod +x $HOME/minio-binaries/mc
TEST_DATA_DIR="${SCRIPT_DIR}/../../test_data"
$HOME/minio-binaries/mc alias set local http://127.0.0.1:9000 minioadmin minioadmin
$HOME/minio-binaries/mc mb local/test-data
$HOME/minio-binaries/mc cp "${TEST_DATA_DIR}"/blogs.parquet local/test-data
$HOME/minio-binaries/mc cp "${TEST_DATA_DIR}"/blogs.parquet "local/test-data/blogs space.parquet"
$HOME/minio-binaries/mc cp "${TEST_DATA_DIR}"/blogs.parquet local/test-data/blogs/
MCBIN=mc
if ! which "${MCBIN}" &>/dev/null; then
curl --fail-with-body https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o $HOME/minio-binaries/mc
chmod +x $HOME/minio-binaries/mc
MCBIN=$HOME/minio-binaries/mc
fi
"${MCBIN}" alias set local http://127.0.0.1:9000 minioadmin minioadmin
"${MCBIN}" mb local/test-data
"${MCBIN}" cp "${TEST_DATA_DIR}"/blogs.parquet local/test-data
"${MCBIN}" cp "${TEST_DATA_DIR}"/blogs.parquet "local/test-data/blogs space.parquet"
"${MCBIN}" cp "${TEST_DATA_DIR}"/blogs.parquet local/test-data/blogs/
# populate partitioned table in S3
"${MCBIN}" cp "${TEST_DATA_DIR}"/blogs.parquet local/test-data/partitioned_blogs/year=2024/month=10/
"${MCBIN}" cp "${TEST_DATA_DIR}"/blogs.parquet local/test-data/partitioned_blogs/year=2023/month=2/
# populate delta table
"${MCBIN}" cp --recursive "${TEST_DATA_DIR}"/blogs-delta local/test-data/

View File

@@ -2,4 +2,9 @@
set -eux
docker run -d -p 9000:9000 quay.io/minio/minio:RELEASE.2023-08-23T10-07-06Z server /data
CTLBIN="docker"
if which podman &>/dev/null; then
CTLBIN="podman"
fi
${CTLBIN} run -d --name minio -p 9000:9000 quay.io/minio/minio:RELEASE.2023-08-23T10-07-06Z server /data

View File

@@ -51,7 +51,7 @@ tokio-postgres = { version = "0.7.8", optional = true }
version = "0.17"
# git = "https://github.com/delta-io/delta-rs.git"
# rev = "63c14b3716428ff65e01404c6f7e62f341c98f05"
features = ["datafusion"]
features = ["datafusion", "s3", "gcs", "azure"]
default-features = false
[dependencies.connectorx]

View File

@@ -1,6 +1,7 @@
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::Once;
use datafusion::arrow;
use datafusion::arrow::array::as_string_array;
@@ -24,6 +25,8 @@ use crate::query;
use crate::table::TableIoSource;
use crate::table::{self, KeyValueSource, TableSource};
static START: Once = Once::new();
pub struct ColumnQ {
pub dfctx: SessionContext,
schema_map: HashMap<String, arrow::datatypes::SchemaRef>,
@@ -40,6 +43,12 @@ impl ColumnQ {
}
pub fn new_with_config(config: SessionConfig) -> Self {
START.call_once(|| {
deltalake::aws::register_handlers(None);
deltalake::azure::register_handlers(None);
deltalake::gcp::register_handlers(None);
});
let config = config
.with_default_catalog_and_schema("roapi", "public")
// TODO: fix bug in datafusion to support partitioned table when