upgrade datafusion and arrow (#279)

* update arrow, datafusion, delta, convergence, object_store, sqlparser

* fix breaking code

* update to datafusion 23

* update connectorx fork

---------

Co-authored-by: Chitral Verma <chitralverma@gmail.com>
This commit is contained in:
QP Hou 2023-05-29 23:25:45 -07:00 committed by GitHub
parent ae13d3bc9e
commit df39a13d3f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
25 changed files with 838 additions and 467 deletions

985
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -8,7 +8,3 @@ split-debuginfo = "unpacked"
[profile.release]
lto = true
codegen-units = 1
[patch.crates-io]
convergence = { git = "https://github.com/roapi/convergence.git", rev = "1af3f7ea76b0586362b332f8a1c30053aca58c2e" }
convergence-arrow = { git = "https://github.com/roapi/convergence.git", rev = "1af3f7ea76b0586362b332f8a1c30053aca58c2e" }

View File

@ -13,10 +13,10 @@ path = "src/lib.rs"
[dependencies]
# pulling arrow-schema manually to enable the serde feature.
# TODO: add serde feature in datafusion to avoid this workaround
arrow-schema = { version ="26", features = ["serde"] }
arrow-schema = { version ="37.0.0", features = ["serde"] }
datafusion = "14"
object_store = { version = "0.5.4", features = ["aws_profile", "gcp", "azure"] }
datafusion = "23"
object_store = { version = "0.5.6", features = ["aws_profile", "gcp", "azure"] }
percent-encoding = "2.2.0"
url = "2.2"
@ -24,7 +24,7 @@ log = "0"
regex = "1"
lazy_static = "1"
graphql-parser = "0"
sqlparser = "0.27" # version need to be in sync with convergence
sqlparser = "0.33" # version need to be in sync with convergence
yup-oauth2 = { version = "6.2", default-features = false, features = [
"service_account",
] }
@ -43,18 +43,18 @@ calamine = "0.19.1"
tokio = { version = "1", features = ["rt-multi-thread"] }
futures = "0.3"
hyper-tls = { version = "0.5.0", default-features = false, optional = true }
hyper-rustls = { version = "0.23.0", default-features = false, optional = true }
hyper-rustls = { version = "0.23.2", default-features = false, optional = true }
[dependencies.deltalake]
git = "https://github.com/delta-io/delta-rs.git"
rev = "d1e68cac9fc33b08dfb93260038f50c117c8534d"
rev = "72a9e5827e99c7d2a1cf05806ffce6f0a4449d47"
default-features = false
features = ["datafusion-ext"]
[dependencies.connectorx]
git = "https://github.com/sfu-db/connector-x.git"
rev = "962b396857979c813486ad842e91a0c21ce55f72"
version = "0.3.2-alpha.3"
git = "https://github.com/roapi/connector-x.git"
rev = "dd58b6a90d28b1ee7e62da859a5ba1d2d6c0b179"
version = "0.3.2-alpha.5"
features = ["default", "dst_arrow"]
optional = true
@ -72,7 +72,7 @@ default = ["rustls"]
rustls = [
"hyper-rustls",
"reqwest/rustls-tls",
"deltalake/s3-rustls",
"deltalake/s3",
"yup-oauth2/hyper-rustls",
]
native-tls-vendored = [

View File

@ -7,7 +7,7 @@ use std::sync::Arc;
use datafusion::arrow;
use datafusion::arrow::array::as_string_array;
use datafusion::arrow::array::StringArray;
use datafusion::datasource::object_store::{ObjectStoreProvider, ObjectStoreRegistry};
use datafusion::datasource::object_store::ObjectStoreRegistry;
use datafusion::error::{DataFusionError, Result as DatafusionResult};
pub use datafusion::execution::context::SessionConfig;
use datafusion::execution::context::SessionContext;
@ -20,11 +20,33 @@ use crate::table::{self, KeyValueSource, TableSource};
use object_store::aws::AmazonS3Builder;
use object_store::azure::MicrosoftAzureBuilder;
use object_store::gcp::GoogleCloudStorageBuilder;
use object_store::DynObjectStore;
use url::Url;
pub struct ColumnQObjectStoreProvider {}
impl ObjectStoreProvider for ColumnQObjectStoreProvider {
fn get_by_url(&self, url: &Url) -> DatafusionResult<Arc<dyn object_store::ObjectStore>> {
#[derive(Default)]
pub struct ColumnQObjectStoreRegistry {}
impl std::fmt::Debug for ColumnQObjectStoreRegistry {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
f.debug_struct("ColumnQObjectStoreRegistry").finish()
}
}
impl ColumnQObjectStoreRegistry {
pub fn get_by_url(&self, url: &Url) -> DatafusionResult<Arc<DynObjectStore>> {
self.get_store(url)
}
}
impl ObjectStoreRegistry for ColumnQObjectStoreRegistry {
fn register_store(
&self,
_url: &Url,
_store: Arc<DynObjectStore>,
) -> Option<Arc<DynObjectStore>> {
None
}
fn get_store(&self, url: &Url) -> DatafusionResult<Arc<DynObjectStore>> {
match url.host_str() {
None => Err(DataFusionError::Execution(format!(
"Missing bucket name: {}",
@ -79,13 +101,15 @@ pub struct ColumnQ {
impl ColumnQ {
pub fn new() -> Self {
Self::new_with_config(SessionConfig::from_env().with_information_schema(true))
Self::new_with_config(
SessionConfig::from_env()
.expect("Valid environment variables should be set to create SessionConfig")
.with_information_schema(true),
)
}
pub fn new_with_config(config: SessionConfig) -> Self {
let object_store_provider = ColumnQObjectStoreProvider {};
let object_store_registry =
ObjectStoreRegistry::new_with_provider(Some(Arc::new(object_store_provider)));
let object_store_registry = ColumnQObjectStoreRegistry::default();
let rn_config =
RuntimeConfig::new().with_object_store_registry(Arc::new(object_store_registry));
let runtime_env = RuntimeEnv::new(rn_config).unwrap();
@ -120,11 +144,11 @@ impl ColumnQ {
return Err(ColumnQError::invalid_kv_key_type());
}
let val_schema_idx = schema.index_of(&value)?;
let projections = Some(vec![key_schema_idx, val_schema_idx]);
let projections = vec![key_schema_idx, val_schema_idx];
let filters = &[];
let exec_plan = table
.scan(&self.dfctx.state(), &projections, filters, None)
.scan(&self.dfctx.state(), Some(&projections), filters, None)
.await?;
let batches = collect(exec_plan, self.dfctx.task_ctx()).await?;
let mut kv = HashMap::new();
@ -214,15 +238,14 @@ mod tests {
use std::{env, str::FromStr};
use tempfile::Builder;
use datafusion::datasource::object_store::ObjectStoreProvider;
use url::Url;
use super::ColumnQObjectStoreProvider;
use super::ColumnQObjectStoreRegistry;
#[test]
fn s3_object_store_type() {
let host_url = "s3://bucket_name/path";
let provider = ColumnQObjectStoreProvider {};
let provider = ColumnQObjectStoreRegistry {};
let err = provider
.get_by_url(&Url::from_str(host_url).unwrap())
@ -242,7 +265,7 @@ mod tests {
#[test]
fn s3_object_store_type_no_bucket() {
let host_url = "s3://";
let provider = ColumnQObjectStoreProvider {};
let provider = ColumnQObjectStoreRegistry {};
let err = provider
.get_by_url(&Url::from_str(host_url).unwrap())
@ -253,7 +276,7 @@ mod tests {
#[tokio::test]
async fn gcs_object_store_type() -> anyhow::Result<()> {
let host_url = "gs://bucket_name/path";
let provider = ColumnQObjectStoreProvider {};
let provider = ColumnQObjectStoreRegistry {};
let tmp_dir = Builder::new().prefix("columnq.test.gcs").tempdir()?;
let tmp_gcs_path = tmp_dir.path().join("service_account.json");
@ -280,7 +303,7 @@ mod tests {
#[test]
fn azure_object_store_type() {
let host_url = "az://bucket_name/path";
let provider = ColumnQObjectStoreProvider {};
let provider = ColumnQObjectStoreRegistry {};
// https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio#http-connection-strings
env::set_var("AZURE_STORAGE_ACCOUNT_NAME", "devstoreaccount1");
env::set_var("AZURE_STORAGE_ACCOUNT_KEY", "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==");
@ -299,7 +322,7 @@ mod tests {
#[test]
fn unknown_object_store_type() {
let unknown = "unknown://bucket_name/path";
let provider = ColumnQObjectStoreProvider {};
let provider = ColumnQObjectStoreRegistry {};
let err = provider
.get_by_url(&Url::from_str(unknown).unwrap())
.unwrap_err();

View File

@ -42,11 +42,11 @@ mod tests {
serde_json::json!([
{
"d32": "1970-01-02",
"d64": "1970-01-01",
"d64": "1970-01-01T00:00:00.001",
},
{
"d32": "2021-04-12",
"d64": "2021-04-12",
"d64": "2021-04-12T04:04:28",
},
])
.to_string(),
@ -100,16 +100,16 @@ mod tests {
std::str::from_utf8(&data).unwrap(),
serde_json::json!([
{
"sec": "2021-04-12 04:04:28",
"msec": "2021-04-12 04:04:28",
"usec": "2021-04-12 04:04:28",
"nsec": "2021-04-12 04:04:28",
"sec": "2021-04-12T04:04:28",
"msec": "2021-04-12T04:04:28",
"usec": "2021-04-12T04:04:28",
"nsec": "2021-04-12T04:04:28",
},
{
"sec": "2021-05-12 04:04:28",
"msec": "2021-05-12 04:04:28.001",
"usec": "2021-05-12 04:04:28.000002",
"nsec": "2021-05-12 04:04:28.000000003",
"sec": "2021-05-12T04:04:28",
"msec": "2021-05-12T04:04:28.001",
"usec": "2021-05-12T04:04:28.000002",
"nsec": "2021-05-12T04:04:28.000000003",
}
])
.to_string(),

View File

@ -1,7 +1,6 @@
use crate::columnq::ColumnQObjectStoreProvider;
use crate::columnq::ColumnQObjectStoreRegistry;
use crate::error::ColumnQError;
use crate::table::TableSource;
use datafusion::datasource::object_store::ObjectStoreProvider;
use futures::TryStreamExt;
use object_store::ObjectStore;
use percent_encoding;
@ -27,7 +26,7 @@ where
I: Iterator<Item = &'a str>,
F: FnMut(std::io::Cursor<Vec<u8>>) -> Result<T, ColumnQError>,
{
let object_store_provider = ColumnQObjectStoreProvider {};
let object_store_provider = ColumnQObjectStoreRegistry {};
let mut partitions = vec![];
for path_str in path_iter {
@ -49,7 +48,7 @@ pub async fn partitions_from_uri<'a, F, T>(
where
F: FnMut(std::io::Cursor<Vec<u8>>) -> Result<T, ColumnQError>,
{
let object_store_provider = ColumnQObjectStoreProvider {};
let object_store_provider = ColumnQObjectStoreRegistry {};
let url = &Url::from_str(t.get_uri_str()).unwrap();
let client = object_store_provider.get_by_url(url)?;
let mut partitions = vec![];

View File

@ -1,5 +1,4 @@
use std::convert::TryFrom;
use std::sync::Arc;
use datafusion::arrow;
use datafusion::logical_expr::Operator;
@ -40,9 +39,7 @@ fn invalid_query(message: String) -> QueryError {
// convert order list from graphql argument to datafusion sort columns
//
// sort order matters, thus it's modeled as a list
fn to_datafusion_sort_columns<'b>(
sort_columns: &[Value<'b, &'b str>],
) -> Result<Vec<Expr>, QueryError> {
fn to_datafusion_sort_columns(sort_columns: &[Value<String>]) -> Result<Vec<Expr>, QueryError> {
sort_columns
.iter()
.map(|optval| match optval {
@ -82,7 +79,7 @@ fn to_datafusion_sort_columns<'b>(
.collect()
}
fn operand_to_datafusion_expr<'b>(operand: &Value<'b, &'b str>) -> Result<Expr, QueryError> {
fn operand_to_datafusion_expr(operand: &Value<String>) -> Result<Expr, QueryError> {
match operand {
Value::Boolean(b) => Ok(Expr::Literal(ScalarValue::Boolean(Some(*b)))),
Value::String(s) => Ok(Expr::Literal(ScalarValue::Utf8(Some(s.to_string())))),
@ -117,17 +114,14 @@ fn operand_to_datafusion_expr<'b>(operand: &Value<'b, &'b str>) -> Result<Expr,
// col4
// }
// ```
fn to_datafusion_predicates<'b>(
col: &str,
filter: &Value<'b, &'b str>,
) -> Result<Vec<Expr>, QueryError> {
fn to_datafusion_predicates(col: &str, filter: &Value<String>) -> Result<Vec<Expr>, QueryError> {
match filter {
Value::Object(obj) => obj
.iter()
.map(|(op, operand)| {
let col_expr = Expr::Column(Column::from_name(col.to_string()));
let right_expr = operand_to_datafusion_expr(operand)?;
match *op {
match op.as_str() {
"eq" => Ok(binary_expr(col_expr, Operator::Eq, right_expr)),
"lt" => Ok(binary_expr(col_expr, Operator::Lt, right_expr)),
"lte" | "lteq" => Ok(binary_expr(col_expr, Operator::LtEq, right_expr)),
@ -153,11 +147,11 @@ fn to_datafusion_predicates<'b>(
}
}
pub fn query_to_df(
pub async fn query_to_df(
dfctx: &datafusion::execution::context::SessionContext,
q: &str,
) -> Result<Arc<datafusion::dataframe::DataFrame>, QueryError> {
let doc = parse_query::<&str>(q)?;
) -> Result<datafusion::dataframe::DataFrame, QueryError> {
let doc = parse_query::<String>(q)?;
let def = match doc.definitions.len() {
1 => match &doc.definitions[0] {
@ -228,15 +222,16 @@ pub fn query_to_df(
let field = field.ok_or_else(|| invalid_query("field not found in selection".to_string()))?;
let mut df = dfctx
.table(field.name)
.map_err(|e| QueryError::invalid_table(e, field.name))?;
.table(field.name.as_str())
.await
.map_err(|e| QueryError::invalid_table(e, field.name.as_str()))?;
let mut filter = None;
let mut sort = None;
let mut limit = None;
let mut page = None;
for (key, value) in &field.arguments {
match *key {
match key.as_str() {
"filter" => {
filter = Some(value);
}
@ -277,7 +272,7 @@ pub fn query_to_df(
.items
.iter()
.map(|selection| match selection {
Selection::Field(f) => Ok(f.name),
Selection::Field(f) => Ok(f.name.as_str()),
_ => Err(QueryError {
error: "invalid graphql query".to_string(),
message: "selection set in query should only contain Fields".to_string(),
@ -352,7 +347,8 @@ pub async fn exec_query(
dfctx: &datafusion::execution::context::SessionContext,
q: &str,
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError> {
query_to_df(dfctx, q)?
query_to_df(dfctx, q)
.await?
.collect()
.await
.map_err(QueryError::query_exec)
@ -367,8 +363,8 @@ mod tests {
use super::*;
use crate::test_util::*;
#[test]
fn simple_query_planning() -> anyhow::Result<()> {
#[tokio::test]
async fn simple_query_planning() -> anyhow::Result<()> {
let mut dfctx = SessionContext::new();
register_table_properties(&mut dfctx)?;
@ -386,21 +382,23 @@ mod tests {
bath
}
}"#,
)?;
)
.await?;
let expected_df = dfctx
.table("properties")?
.table("properties")
.await?
.filter(col("bath").gt_eq(lit(2i64)))?
.filter(col("bed").gt(lit(3i64)))?
.select(vec![col("address"), col("bed"), col("bath")])?;
assert_eq_df(df, expected_df);
assert_eq_df(df.into(), expected_df.into());
Ok(())
}
#[test]
fn consistent_and_deterministics_logical_plan() -> anyhow::Result<()> {
#[tokio::test]
async fn consistent_and_deterministics_logical_plan() -> anyhow::Result<()> {
let mut dfctx = SessionContext::new();
register_table_properties(&mut dfctx)?;
@ -420,16 +418,18 @@ mod tests {
bed
}
}"#,
)?;
)
.await?;
let expected_df = dfctx
.table("properties")?
.table("properties")
.await?
.filter(col("bed").gt(lit(3i64)))?
.select(vec![col("address"), col("bed")])?
.sort(vec![column_sort_expr_asc("bed")])?
.limit(0, Some(10))?;
assert_eq_df(df, expected_df);
assert_eq_df(df.into(), expected_df.into());
Ok(())
}
@ -460,18 +460,12 @@ mod tests {
assert_eq!(
batch.column(0).as_ref(),
Arc::new(StringArray::from(vec!["Kenmore, WA", "Fremont, WA",])).as_ref(),
&StringArray::from(vec!["Kenmore, WA", "Fremont, WA",]),
);
assert_eq!(
batch.column(1).as_ref(),
Arc::new(Int64Array::from(vec![4, 5])).as_ref(),
);
assert_eq!(batch.column(1).as_ref(), &Int64Array::from(vec![4, 5]),);
assert_eq!(
batch.column(2).as_ref(),
Arc::new(Int64Array::from(vec![3, 3])).as_ref(),
);
assert_eq!(batch.column(2).as_ref(), &Int64Array::from(vec![3, 3]),);
Ok(())
}

View File

@ -1,19 +1,20 @@
use datafusion::logical_expr::expr::Sort;
use datafusion::prelude::{Column, Expr};
pub fn column_sort_expr_desc(column: String) -> Expr {
Expr::Sort {
Expr::Sort(Sort {
expr: Box::new(Expr::Column(Column::from_name(column))),
asc: false,
nulls_first: true,
}
})
}
pub fn column_sort_expr_asc(column: impl Into<String>) -> Expr {
Expr::Sort {
Expr::Sort(Sort {
expr: Box::new(Expr::Column(Column::from_name(column))),
asc: true,
nulls_first: true,
}
})
}
pub mod graphql;

View File

@ -1,5 +1,4 @@
use std::collections::HashMap;
use std::sync::Arc;
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::logical_expr::Operator;
@ -54,11 +53,11 @@ fn num_parse_err(e: std::num::ParseIntError) -> QueryError {
}
}
pub fn table_query_to_df(
pub async fn table_query_to_df(
dfctx: &datafusion::execution::context::SessionContext,
table_name: &str,
params: &HashMap<String, String>,
) -> Result<Arc<datafusion::dataframe::DataFrame>, QueryError> {
) -> Result<datafusion::dataframe::DataFrame, QueryError> {
lazy_static! {
static ref RE_REST_FILTER: Regex =
Regex::new(r"filter\[(?P<column>.+)\](?P<op>.+)?").unwrap();
@ -66,6 +65,7 @@ pub fn table_query_to_df(
let mut df = dfctx
.table(table_name)
.await
.map_err(|e| QueryError::invalid_table(e, table_name))?;
// filter[col1]eq='foo'
@ -163,7 +163,7 @@ pub async fn query_table(
table_name: &str,
params: &HashMap<String, String>,
) -> Result<Vec<RecordBatch>, QueryError> {
let df = table_query_to_df(dfctx, table_name, params)?;
let df = table_query_to_df(dfctx, table_name, params).await?;
df.collect().await.map_err(QueryError::query_exec)
}
@ -188,18 +188,20 @@ mod tests {
params.insert("columns".to_string(), "ami_id,version".to_string());
params.insert("filter[arch]".to_string(), "'amd64'".to_string());
let df = table_query_to_df(&dfctx, "ubuntu_ami", &params)?;
let df = table_query_to_df(&dfctx, "ubuntu_ami", &params).await?;
assert_eq_df(
df,
df.into(),
dfctx
.table("ubuntu_ami")?
.table("ubuntu_ami")
.await?
.filter(
col("arch").eq(Expr::Literal(ScalarValue::Utf8(Some("amd64".to_string())))),
)?
.select(vec![col("ami_id"), col("version")])?
.sort(vec![column_sort_expr_asc("ami_id")])?
.limit(0, Some(10))?,
.limit(0, Some(10))?
.into(),
);
Ok(())
@ -221,7 +223,7 @@ mod tests {
let batch = &batches[0];
assert_eq!(
batch.column(0).as_ref(),
Arc::new(StringArray::from(vec!["<a href=\"https://console.aws.amazon.com/ec2/home?region=us-east-2#launchAmi=ami-091a87cd1ff23d97c\">ami-091a87cd1ff23d97c</a>"])).as_ref(),
&StringArray::from(vec!["<a href=\"https://console.aws.amazon.com/ec2/home?region=us-east-2#launchAmi=ami-091a87cd1ff23d97c\">ami-091a87cd1ff23d97c</a>"]),
);
Ok(())

View File

@ -1,5 +1,3 @@
use std::sync::Arc;
use datafusion::arrow;
use crate::error::QueryError;
@ -9,12 +7,12 @@ pub async fn exec_query(
sql: &str,
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError> {
let plan = dfctx
.state()
.create_logical_plan(sql)
.await
.map_err(QueryError::plan_sql)?;
let df: Arc<datafusion::dataframe::DataFrame> = Arc::new(
datafusion::dataframe::DataFrame::new(dfctx.state.clone(), &plan),
);
let df = datafusion::dataframe::DataFrame::new(dfctx.state(), plan);
df.collect().await.map_err(QueryError::query_exec)
}
@ -47,15 +45,12 @@ mod tests {
assert_eq!(
batch.column(0).as_ref(),
Arc::new(StringArray::from(vec![
"Carl", "Daniel", "Mike", "Roger", "Sam",
]))
.as_ref(),
&StringArray::from(vec!["Carl", "Daniel", "Mike", "Roger", "Sam",]),
);
assert_eq!(
batch.column(1).as_ref(),
Arc::new(Int64Array::from(vec![3, 3, 4, 3, 2])).as_ref(),
&Int64Array::from(vec![3, 3, 4, 3, 2]),
);
Ok(())

View File

@ -79,7 +79,7 @@ mod tests {
.await?;
let ctx = SessionContext::new();
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
assert_eq!(stats.num_rows, Some(37 * 3));
Ok(())
@ -92,7 +92,7 @@ mod tests {
let t = to_mem_table(&TableSource::new("uk_cities".to_string(), test_path)).await?;
let ctx = SessionContext::new();
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
assert_eq!(stats.num_rows, Some(37));
Ok(())

View File

@ -79,7 +79,7 @@ mod tests {
.await?;
let ctx = SessionContext::new();
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
assert_eq!(stats.num_rows, Some(37 * 3));
Ok(())
@ -92,7 +92,7 @@ mod tests {
let t = to_mem_table(&TableSource::new("uk_cities".to_string(), test_path)).await?;
let ctx = SessionContext::new();
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
assert_eq!(stats.num_rows, Some(37));
Ok(())

View File

@ -132,7 +132,7 @@ mod tests {
.await?;
let ctx = SessionContext::new();
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
assert_eq!(stats.num_rows, Some(37 * 3));
Ok(())
@ -155,7 +155,7 @@ c1,c2,c3
let t = to_mem_table(&source).await?;
let ctx = SessionContext::new();
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
assert_eq!(stats.num_rows, Some(3));
Ok(())

View File

@ -140,7 +140,7 @@ mod tests {
.await?;
let ctx = SessionContext::new();
validate_statistics(t.scan(&ctx.state(), &None, &[], None).await?.statistics());
validate_statistics(t.scan(&ctx.state(), None, &[], None).await?.statistics());
match t.as_any().downcast_ref::<MemTable>() {
Some(_) => Ok(()),

View File

@ -141,7 +141,7 @@ fn infer_schema(rows: &[Vec<String>]) -> Schema {
let dt = dt_iter.fold(dt_init, coerce_type);
// normalize column name by replacing space with under score
Field::new(&col_name.replace(' ', "_"), dt, true)
Field::new(col_name.replace(' ', "_"), dt, true)
})
.collect();
Schema::new(fields)

View File

@ -3,6 +3,7 @@ use std::sync::Arc;
use datafusion::arrow;
use datafusion::arrow::datatypes::Schema;
#[allow(deprecated)]
use datafusion::arrow::json::reader::{Decoder, DecoderOptions};
use datafusion::arrow::record_batch::RecordBatch;
use serde_json::value::Value;
@ -55,6 +56,8 @@ fn json_vec_to_partition(
};
// decode to arrow record batch
#[allow(deprecated)]
// TODO: switch to RawDecoder
let decoder = Decoder::new(
Arc::new(schema.clone()),
DecoderOptions::new().with_batch_size(batch_size),

View File

@ -704,7 +704,7 @@ batch_size: 512
let ctx = datafusion::prelude::SessionContext::new();
let table = load(&t, &ctx).await?;
let stats = table
.scan(&ctx.state(), &None, &[], None)
.scan(&ctx.state(), None, &[], None)
.await?
.statistics();
assert_eq!(stats.num_rows, Some(37));
@ -726,7 +726,7 @@ uri: "sqlite://../test_data/sqlite/sample.{}"
let ctx = datafusion::prelude::SessionContext::new();
let table = load(&t, &ctx).await?;
let stats = table
.scan(&ctx.state(), &None, &[], None)
.scan(&ctx.state(), None, &[], None)
.await?
.statistics();
assert_eq!(stats.num_rows, Some(37));

View File

@ -2,6 +2,7 @@ use std::io::{BufReader, Read};
use std::sync::Arc;
use datafusion::arrow::datatypes::{Schema, SchemaRef};
#[allow(deprecated)]
use datafusion::arrow::json::reader::{infer_json_schema, Decoder, DecoderOptions, ValueIter};
use datafusion::arrow::record_batch::RecordBatch;
@ -18,6 +19,8 @@ fn decode_json_from_reader<R: Read>(
schema_ref: SchemaRef,
batch_size: usize,
) -> Result<Vec<RecordBatch>, ColumnQError> {
#[allow(deprecated)]
// TODO: switch to RawDecoder
let decoder = Decoder::new(
schema_ref,
DecoderOptions::new().with_batch_size(batch_size),

View File

@ -121,7 +121,7 @@ mod tests {
.unwrap();
let stats = t
.scan(&ctx.state(), &None, &[], None)
.scan(&ctx.state(), None, &[], None)
.await
.unwrap()
.statistics();
@ -146,7 +146,7 @@ mod tests {
.await?;
let ctx = SessionContext::new();
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
assert_eq!(stats.num_rows, Some(500));
Ok(())
@ -171,7 +171,7 @@ mod tests {
.await?;
let ctx = SessionContext::new();
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
assert_eq!(stats.num_rows, Some(1500));
Ok(())

View File

@ -64,7 +64,7 @@ fn infer_schema(r: &Range<calamine::DataType>) -> Result<Schema, ColumnQError> {
let mut dt_iter = set.iter().cloned();
let dt = dt_iter.next().unwrap_or(DataType::Utf8);
Field::new(&col_name.replace(' ', "_"), dt, true)
Field::new(col_name.replace(' ', "_"), dt, true)
})
.collect();
Ok(Schema::new(fields))
@ -182,7 +182,7 @@ sheet_name = "uk_cities_with_headers"
let t = to_mem_table(&table_source).await.unwrap();
let ctx = SessionContext::new();
let stats = t
.scan(&ctx.state(), &None, &[], None)
.scan(&ctx.state(), None, &[], None)
.await
.unwrap()
.statistics();
@ -207,7 +207,7 @@ option:
let t = to_mem_table(&table_source).await.unwrap();
let ctx = SessionContext::new();
let stats = t
.scan(&ctx.state(), &None, &[], None)
.scan(&ctx.state(), None, &[], None)
.await
.unwrap()
.statistics();

View File

@ -137,7 +137,7 @@ pub async fn register_table_ubuntu_ami(dfctx: &mut SessionContext) -> anyhow::Re
pub fn assert_eq_df(df1: Arc<DataFrame>, df2: Arc<DataFrame>) {
assert_eq!(
format!("{:?}", df1.to_logical_plan()),
format!("{:?}", df2.to_logical_plan())
format!("{:?}", (*df1).clone().into_optimized_plan()),
format!("{:?}", (*df2).clone().into_optimized_plan())
);
}

View File

@ -15,7 +15,7 @@ mod mysql {
let t = DatabaseLoader::MySQL
.to_mem_table(&TableSource::new(name, env::var("MYSQL_URL")?))?;
let ctx = SessionContext::new();
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
assert!(stats.num_rows.is_some());
}

View File

@ -40,8 +40,8 @@ clap = { version = "3", features = ["color"] }
thiserror = "1"
anyhow = "1"
convergence = "0.9"
convergence-arrow = "0.9"
convergence = "0.11.0"
convergence-arrow = "0.11.0"
[features]
default = ["rustls", "snmalloc"]

View File

@ -1,8 +1,7 @@
use std::collections::HashMap;
use std::sync::Arc;
use async_trait::async_trait;
use columnq::datafusion::arrow;
use columnq::arrow::record_batch::RecordBatch;
use columnq::datafusion::dataframe::DataFrame;
use columnq::datafusion::error::DataFusionError;
use columnq::encoding;
@ -61,25 +60,19 @@ pub trait RoapiContext: Send + Sync + 'static {
async fn table_schema_json_bytes(&self, table_name: &str) -> Result<Vec<u8>, ApiErrResp>;
async fn query_graphql(
&self,
query: &str,
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError>;
async fn query_graphql(&self, query: &str) -> Result<Vec<RecordBatch>, QueryError>;
async fn query_sql(
&self,
query: &str,
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError>;
async fn query_sql(&self, query: &str) -> Result<Vec<RecordBatch>, QueryError>;
async fn query_rest_table(
&self,
table_name: &str,
params: &HashMap<String, String>,
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError>;
) -> Result<Vec<RecordBatch>, QueryError>;
async fn kv_get(&self, kv_name: &str, key: &str) -> Result<Option<String>, QueryError>;
async fn sql_to_df(&self, query: &str) -> Result<Arc<DataFrame>, DataFusionError>;
async fn sql_to_df(&self, query: &str) -> Result<DataFrame, DataFusionError>;
async fn get_response_format(&self) -> encoding::ContentType;
}
@ -119,18 +112,12 @@ impl RoapiContext for RawRoapiContext {
}
#[inline]
async fn query_graphql(
&self,
query: &str,
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError> {
async fn query_graphql(&self, query: &str) -> Result<Vec<RecordBatch>, QueryError> {
self.cq.query_graphql(query).await
}
#[inline]
async fn query_sql(
&self,
query: &str,
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError> {
async fn query_sql(&self, query: &str) -> Result<Vec<RecordBatch>, QueryError> {
self.cq.query_sql(query).await
}
@ -139,7 +126,7 @@ impl RoapiContext for RawRoapiContext {
&self,
table_name: &str,
params: &HashMap<String, String>,
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError> {
) -> Result<Vec<RecordBatch>, QueryError> {
self.cq.query_rest_table(table_name, params).await
}
@ -149,7 +136,7 @@ impl RoapiContext for RawRoapiContext {
}
#[inline]
async fn sql_to_df(&self, query: &str) -> Result<Arc<DataFrame>, DataFusionError> {
async fn sql_to_df(&self, query: &str) -> Result<DataFrame, DataFusionError> {
self.cq.dfctx.sql(query).await
}
@ -195,19 +182,13 @@ impl RoapiContext for ConcurrentRoapiContext {
}
#[inline]
async fn query_graphql(
&self,
query: &str,
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError> {
async fn query_graphql(&self, query: &str) -> Result<Vec<RecordBatch>, QueryError> {
let ctx = self.read().await;
ctx.cq.query_graphql(query).await
}
#[inline]
async fn query_sql(
&self,
query: &str,
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError> {
async fn query_sql(&self, query: &str) -> Result<Vec<RecordBatch>, QueryError> {
let ctx = self.read().await;
ctx.cq.query_sql(query).await
}
@ -217,7 +198,7 @@ impl RoapiContext for ConcurrentRoapiContext {
&self,
table_name: &str,
params: &HashMap<String, String>,
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError> {
) -> Result<Vec<RecordBatch>, QueryError> {
let ctx = self.read().await;
ctx.cq.query_rest_table(table_name, params).await
}
@ -229,7 +210,7 @@ impl RoapiContext for ConcurrentRoapiContext {
}
#[inline]
async fn sql_to_df(&self, query: &str) -> Result<Arc<DataFrame>, DataFusionError> {
async fn sql_to_df(&self, query: &str) -> Result<DataFrame, DataFusionError> {
let ctx = self.read().await;
ctx.cq.dfctx.sql(query).await
}

View File

@ -20,18 +20,19 @@ use crate::context::RoapiContext;
use crate::server::RunnableServer;
fn df_err_to_sql(err: DataFusionError) -> ErrorResponse {
ErrorResponse::error(SqlState::DATA_EXCEPTION, err.to_string())
ErrorResponse::error(SqlState::DataException, err.to_string())
}
/// A portal built using a logical DataFusion query plan.
pub struct DataFusionPortal {
df: Arc<DataFrame>,
df: DataFrame,
}
#[async_trait]
impl Portal for DataFusionPortal {
async fn fetch(&mut self, batch: &mut DataRowBatch) -> Result<(), ErrorResponse> {
for arrow_batch in self.df.collect().await.map_err(df_err_to_sql)? {
let arrow_batches = self.df.clone().collect().await.map_err(df_err_to_sql)?;
for arrow_batch in arrow_batches {
record_batch_to_rows(&arrow_batch, batch)?;
}
Ok(())