mirror of
https://github.com/roapi/roapi.git
synced 2026-06-05 21:04:02 +08:00
upgrade datafusion and arrow (#279)
* update arrow, datafusion, delta, convergence, object_store, sqlparser * fix breaking code * update to datafusion 23 * update connectorx fork --------- Co-authored-by: Chitral Verma <chitralverma@gmail.com>
This commit is contained in:
parent
ae13d3bc9e
commit
df39a13d3f
985
Cargo.lock
generated
985
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -8,7 +8,3 @@ split-debuginfo = "unpacked"
|
||||
[profile.release]
|
||||
lto = true
|
||||
codegen-units = 1
|
||||
|
||||
[patch.crates-io]
|
||||
convergence = { git = "https://github.com/roapi/convergence.git", rev = "1af3f7ea76b0586362b332f8a1c30053aca58c2e" }
|
||||
convergence-arrow = { git = "https://github.com/roapi/convergence.git", rev = "1af3f7ea76b0586362b332f8a1c30053aca58c2e" }
|
||||
|
||||
@ -13,10 +13,10 @@ path = "src/lib.rs"
|
||||
[dependencies]
|
||||
# pulling arrow-schema manually to enable the serde feature.
|
||||
# TODO: add serde feature in datafusion to avoid this workaround
|
||||
arrow-schema = { version ="26", features = ["serde"] }
|
||||
arrow-schema = { version ="37.0.0", features = ["serde"] }
|
||||
|
||||
datafusion = "14"
|
||||
object_store = { version = "0.5.4", features = ["aws_profile", "gcp", "azure"] }
|
||||
datafusion = "23"
|
||||
object_store = { version = "0.5.6", features = ["aws_profile", "gcp", "azure"] }
|
||||
percent-encoding = "2.2.0"
|
||||
url = "2.2"
|
||||
|
||||
@ -24,7 +24,7 @@ log = "0"
|
||||
regex = "1"
|
||||
lazy_static = "1"
|
||||
graphql-parser = "0"
|
||||
sqlparser = "0.27" # version need to be in sync with convergence
|
||||
sqlparser = "0.33" # version need to be in sync with convergence
|
||||
yup-oauth2 = { version = "6.2", default-features = false, features = [
|
||||
"service_account",
|
||||
] }
|
||||
@ -43,18 +43,18 @@ calamine = "0.19.1"
|
||||
tokio = { version = "1", features = ["rt-multi-thread"] }
|
||||
futures = "0.3"
|
||||
hyper-tls = { version = "0.5.0", default-features = false, optional = true }
|
||||
hyper-rustls = { version = "0.23.0", default-features = false, optional = true }
|
||||
hyper-rustls = { version = "0.23.2", default-features = false, optional = true }
|
||||
|
||||
[dependencies.deltalake]
|
||||
git = "https://github.com/delta-io/delta-rs.git"
|
||||
rev = "d1e68cac9fc33b08dfb93260038f50c117c8534d"
|
||||
rev = "72a9e5827e99c7d2a1cf05806ffce6f0a4449d47"
|
||||
default-features = false
|
||||
features = ["datafusion-ext"]
|
||||
|
||||
[dependencies.connectorx]
|
||||
git = "https://github.com/sfu-db/connector-x.git"
|
||||
rev = "962b396857979c813486ad842e91a0c21ce55f72"
|
||||
version = "0.3.2-alpha.3"
|
||||
git = "https://github.com/roapi/connector-x.git"
|
||||
rev = "dd58b6a90d28b1ee7e62da859a5ba1d2d6c0b179"
|
||||
version = "0.3.2-alpha.5"
|
||||
features = ["default", "dst_arrow"]
|
||||
optional = true
|
||||
|
||||
@ -72,7 +72,7 @@ default = ["rustls"]
|
||||
rustls = [
|
||||
"hyper-rustls",
|
||||
"reqwest/rustls-tls",
|
||||
"deltalake/s3-rustls",
|
||||
"deltalake/s3",
|
||||
"yup-oauth2/hyper-rustls",
|
||||
]
|
||||
native-tls-vendored = [
|
||||
|
||||
@ -7,7 +7,7 @@ use std::sync::Arc;
|
||||
use datafusion::arrow;
|
||||
use datafusion::arrow::array::as_string_array;
|
||||
use datafusion::arrow::array::StringArray;
|
||||
use datafusion::datasource::object_store::{ObjectStoreProvider, ObjectStoreRegistry};
|
||||
use datafusion::datasource::object_store::ObjectStoreRegistry;
|
||||
use datafusion::error::{DataFusionError, Result as DatafusionResult};
|
||||
pub use datafusion::execution::context::SessionConfig;
|
||||
use datafusion::execution::context::SessionContext;
|
||||
@ -20,11 +20,33 @@ use crate::table::{self, KeyValueSource, TableSource};
|
||||
use object_store::aws::AmazonS3Builder;
|
||||
use object_store::azure::MicrosoftAzureBuilder;
|
||||
use object_store::gcp::GoogleCloudStorageBuilder;
|
||||
use object_store::DynObjectStore;
|
||||
use url::Url;
|
||||
|
||||
pub struct ColumnQObjectStoreProvider {}
|
||||
impl ObjectStoreProvider for ColumnQObjectStoreProvider {
|
||||
fn get_by_url(&self, url: &Url) -> DatafusionResult<Arc<dyn object_store::ObjectStore>> {
|
||||
#[derive(Default)]
|
||||
pub struct ColumnQObjectStoreRegistry {}
|
||||
|
||||
impl std::fmt::Debug for ColumnQObjectStoreRegistry {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
f.debug_struct("ColumnQObjectStoreRegistry").finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl ColumnQObjectStoreRegistry {
|
||||
pub fn get_by_url(&self, url: &Url) -> DatafusionResult<Arc<DynObjectStore>> {
|
||||
self.get_store(url)
|
||||
}
|
||||
}
|
||||
impl ObjectStoreRegistry for ColumnQObjectStoreRegistry {
|
||||
fn register_store(
|
||||
&self,
|
||||
_url: &Url,
|
||||
_store: Arc<DynObjectStore>,
|
||||
) -> Option<Arc<DynObjectStore>> {
|
||||
None
|
||||
}
|
||||
|
||||
fn get_store(&self, url: &Url) -> DatafusionResult<Arc<DynObjectStore>> {
|
||||
match url.host_str() {
|
||||
None => Err(DataFusionError::Execution(format!(
|
||||
"Missing bucket name: {}",
|
||||
@ -79,13 +101,15 @@ pub struct ColumnQ {
|
||||
|
||||
impl ColumnQ {
|
||||
pub fn new() -> Self {
|
||||
Self::new_with_config(SessionConfig::from_env().with_information_schema(true))
|
||||
Self::new_with_config(
|
||||
SessionConfig::from_env()
|
||||
.expect("Valid environment variables should be set to create SessionConfig")
|
||||
.with_information_schema(true),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn new_with_config(config: SessionConfig) -> Self {
|
||||
let object_store_provider = ColumnQObjectStoreProvider {};
|
||||
let object_store_registry =
|
||||
ObjectStoreRegistry::new_with_provider(Some(Arc::new(object_store_provider)));
|
||||
let object_store_registry = ColumnQObjectStoreRegistry::default();
|
||||
let rn_config =
|
||||
RuntimeConfig::new().with_object_store_registry(Arc::new(object_store_registry));
|
||||
let runtime_env = RuntimeEnv::new(rn_config).unwrap();
|
||||
@ -120,11 +144,11 @@ impl ColumnQ {
|
||||
return Err(ColumnQError::invalid_kv_key_type());
|
||||
}
|
||||
let val_schema_idx = schema.index_of(&value)?;
|
||||
let projections = Some(vec![key_schema_idx, val_schema_idx]);
|
||||
let projections = vec![key_schema_idx, val_schema_idx];
|
||||
|
||||
let filters = &[];
|
||||
let exec_plan = table
|
||||
.scan(&self.dfctx.state(), &projections, filters, None)
|
||||
.scan(&self.dfctx.state(), Some(&projections), filters, None)
|
||||
.await?;
|
||||
let batches = collect(exec_plan, self.dfctx.task_ctx()).await?;
|
||||
let mut kv = HashMap::new();
|
||||
@ -214,15 +238,14 @@ mod tests {
|
||||
use std::{env, str::FromStr};
|
||||
use tempfile::Builder;
|
||||
|
||||
use datafusion::datasource::object_store::ObjectStoreProvider;
|
||||
use url::Url;
|
||||
|
||||
use super::ColumnQObjectStoreProvider;
|
||||
use super::ColumnQObjectStoreRegistry;
|
||||
|
||||
#[test]
|
||||
fn s3_object_store_type() {
|
||||
let host_url = "s3://bucket_name/path";
|
||||
let provider = ColumnQObjectStoreProvider {};
|
||||
let provider = ColumnQObjectStoreRegistry {};
|
||||
|
||||
let err = provider
|
||||
.get_by_url(&Url::from_str(host_url).unwrap())
|
||||
@ -242,7 +265,7 @@ mod tests {
|
||||
#[test]
|
||||
fn s3_object_store_type_no_bucket() {
|
||||
let host_url = "s3://";
|
||||
let provider = ColumnQObjectStoreProvider {};
|
||||
let provider = ColumnQObjectStoreRegistry {};
|
||||
|
||||
let err = provider
|
||||
.get_by_url(&Url::from_str(host_url).unwrap())
|
||||
@ -253,7 +276,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn gcs_object_store_type() -> anyhow::Result<()> {
|
||||
let host_url = "gs://bucket_name/path";
|
||||
let provider = ColumnQObjectStoreProvider {};
|
||||
let provider = ColumnQObjectStoreRegistry {};
|
||||
|
||||
let tmp_dir = Builder::new().prefix("columnq.test.gcs").tempdir()?;
|
||||
let tmp_gcs_path = tmp_dir.path().join("service_account.json");
|
||||
@ -280,7 +303,7 @@ mod tests {
|
||||
#[test]
|
||||
fn azure_object_store_type() {
|
||||
let host_url = "az://bucket_name/path";
|
||||
let provider = ColumnQObjectStoreProvider {};
|
||||
let provider = ColumnQObjectStoreRegistry {};
|
||||
// https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio#http-connection-strings
|
||||
env::set_var("AZURE_STORAGE_ACCOUNT_NAME", "devstoreaccount1");
|
||||
env::set_var("AZURE_STORAGE_ACCOUNT_KEY", "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==");
|
||||
@ -299,7 +322,7 @@ mod tests {
|
||||
#[test]
|
||||
fn unknown_object_store_type() {
|
||||
let unknown = "unknown://bucket_name/path";
|
||||
let provider = ColumnQObjectStoreProvider {};
|
||||
let provider = ColumnQObjectStoreRegistry {};
|
||||
let err = provider
|
||||
.get_by_url(&Url::from_str(unknown).unwrap())
|
||||
.unwrap_err();
|
||||
|
||||
@ -42,11 +42,11 @@ mod tests {
|
||||
serde_json::json!([
|
||||
{
|
||||
"d32": "1970-01-02",
|
||||
"d64": "1970-01-01",
|
||||
"d64": "1970-01-01T00:00:00.001",
|
||||
},
|
||||
{
|
||||
"d32": "2021-04-12",
|
||||
"d64": "2021-04-12",
|
||||
"d64": "2021-04-12T04:04:28",
|
||||
},
|
||||
])
|
||||
.to_string(),
|
||||
@ -100,16 +100,16 @@ mod tests {
|
||||
std::str::from_utf8(&data).unwrap(),
|
||||
serde_json::json!([
|
||||
{
|
||||
"sec": "2021-04-12 04:04:28",
|
||||
"msec": "2021-04-12 04:04:28",
|
||||
"usec": "2021-04-12 04:04:28",
|
||||
"nsec": "2021-04-12 04:04:28",
|
||||
"sec": "2021-04-12T04:04:28",
|
||||
"msec": "2021-04-12T04:04:28",
|
||||
"usec": "2021-04-12T04:04:28",
|
||||
"nsec": "2021-04-12T04:04:28",
|
||||
},
|
||||
{
|
||||
"sec": "2021-05-12 04:04:28",
|
||||
"msec": "2021-05-12 04:04:28.001",
|
||||
"usec": "2021-05-12 04:04:28.000002",
|
||||
"nsec": "2021-05-12 04:04:28.000000003",
|
||||
"sec": "2021-05-12T04:04:28",
|
||||
"msec": "2021-05-12T04:04:28.001",
|
||||
"usec": "2021-05-12T04:04:28.000002",
|
||||
"nsec": "2021-05-12T04:04:28.000000003",
|
||||
}
|
||||
])
|
||||
.to_string(),
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
use crate::columnq::ColumnQObjectStoreProvider;
|
||||
use crate::columnq::ColumnQObjectStoreRegistry;
|
||||
use crate::error::ColumnQError;
|
||||
use crate::table::TableSource;
|
||||
use datafusion::datasource::object_store::ObjectStoreProvider;
|
||||
use futures::TryStreamExt;
|
||||
use object_store::ObjectStore;
|
||||
use percent_encoding;
|
||||
@ -27,7 +26,7 @@ where
|
||||
I: Iterator<Item = &'a str>,
|
||||
F: FnMut(std::io::Cursor<Vec<u8>>) -> Result<T, ColumnQError>,
|
||||
{
|
||||
let object_store_provider = ColumnQObjectStoreProvider {};
|
||||
let object_store_provider = ColumnQObjectStoreRegistry {};
|
||||
let mut partitions = vec![];
|
||||
|
||||
for path_str in path_iter {
|
||||
@ -49,7 +48,7 @@ pub async fn partitions_from_uri<'a, F, T>(
|
||||
where
|
||||
F: FnMut(std::io::Cursor<Vec<u8>>) -> Result<T, ColumnQError>,
|
||||
{
|
||||
let object_store_provider = ColumnQObjectStoreProvider {};
|
||||
let object_store_provider = ColumnQObjectStoreRegistry {};
|
||||
let url = &Url::from_str(t.get_uri_str()).unwrap();
|
||||
let client = object_store_provider.get_by_url(url)?;
|
||||
let mut partitions = vec![];
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
use std::convert::TryFrom;
|
||||
use std::sync::Arc;
|
||||
|
||||
use datafusion::arrow;
|
||||
use datafusion::logical_expr::Operator;
|
||||
@ -40,9 +39,7 @@ fn invalid_query(message: String) -> QueryError {
|
||||
// convert order list from graphql argument to datafusion sort columns
|
||||
//
|
||||
// sort order matters, thus it's modeled as a list
|
||||
fn to_datafusion_sort_columns<'b>(
|
||||
sort_columns: &[Value<'b, &'b str>],
|
||||
) -> Result<Vec<Expr>, QueryError> {
|
||||
fn to_datafusion_sort_columns(sort_columns: &[Value<String>]) -> Result<Vec<Expr>, QueryError> {
|
||||
sort_columns
|
||||
.iter()
|
||||
.map(|optval| match optval {
|
||||
@ -82,7 +79,7 @@ fn to_datafusion_sort_columns<'b>(
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn operand_to_datafusion_expr<'b>(operand: &Value<'b, &'b str>) -> Result<Expr, QueryError> {
|
||||
fn operand_to_datafusion_expr(operand: &Value<String>) -> Result<Expr, QueryError> {
|
||||
match operand {
|
||||
Value::Boolean(b) => Ok(Expr::Literal(ScalarValue::Boolean(Some(*b)))),
|
||||
Value::String(s) => Ok(Expr::Literal(ScalarValue::Utf8(Some(s.to_string())))),
|
||||
@ -117,17 +114,14 @@ fn operand_to_datafusion_expr<'b>(operand: &Value<'b, &'b str>) -> Result<Expr,
|
||||
// col4
|
||||
// }
|
||||
// ```
|
||||
fn to_datafusion_predicates<'b>(
|
||||
col: &str,
|
||||
filter: &Value<'b, &'b str>,
|
||||
) -> Result<Vec<Expr>, QueryError> {
|
||||
fn to_datafusion_predicates(col: &str, filter: &Value<String>) -> Result<Vec<Expr>, QueryError> {
|
||||
match filter {
|
||||
Value::Object(obj) => obj
|
||||
.iter()
|
||||
.map(|(op, operand)| {
|
||||
let col_expr = Expr::Column(Column::from_name(col.to_string()));
|
||||
let right_expr = operand_to_datafusion_expr(operand)?;
|
||||
match *op {
|
||||
match op.as_str() {
|
||||
"eq" => Ok(binary_expr(col_expr, Operator::Eq, right_expr)),
|
||||
"lt" => Ok(binary_expr(col_expr, Operator::Lt, right_expr)),
|
||||
"lte" | "lteq" => Ok(binary_expr(col_expr, Operator::LtEq, right_expr)),
|
||||
@ -153,11 +147,11 @@ fn to_datafusion_predicates<'b>(
|
||||
}
|
||||
}
|
||||
|
||||
pub fn query_to_df(
|
||||
pub async fn query_to_df(
|
||||
dfctx: &datafusion::execution::context::SessionContext,
|
||||
q: &str,
|
||||
) -> Result<Arc<datafusion::dataframe::DataFrame>, QueryError> {
|
||||
let doc = parse_query::<&str>(q)?;
|
||||
) -> Result<datafusion::dataframe::DataFrame, QueryError> {
|
||||
let doc = parse_query::<String>(q)?;
|
||||
|
||||
let def = match doc.definitions.len() {
|
||||
1 => match &doc.definitions[0] {
|
||||
@ -228,15 +222,16 @@ pub fn query_to_df(
|
||||
let field = field.ok_or_else(|| invalid_query("field not found in selection".to_string()))?;
|
||||
|
||||
let mut df = dfctx
|
||||
.table(field.name)
|
||||
.map_err(|e| QueryError::invalid_table(e, field.name))?;
|
||||
.table(field.name.as_str())
|
||||
.await
|
||||
.map_err(|e| QueryError::invalid_table(e, field.name.as_str()))?;
|
||||
|
||||
let mut filter = None;
|
||||
let mut sort = None;
|
||||
let mut limit = None;
|
||||
let mut page = None;
|
||||
for (key, value) in &field.arguments {
|
||||
match *key {
|
||||
match key.as_str() {
|
||||
"filter" => {
|
||||
filter = Some(value);
|
||||
}
|
||||
@ -277,7 +272,7 @@ pub fn query_to_df(
|
||||
.items
|
||||
.iter()
|
||||
.map(|selection| match selection {
|
||||
Selection::Field(f) => Ok(f.name),
|
||||
Selection::Field(f) => Ok(f.name.as_str()),
|
||||
_ => Err(QueryError {
|
||||
error: "invalid graphql query".to_string(),
|
||||
message: "selection set in query should only contain Fields".to_string(),
|
||||
@ -352,7 +347,8 @@ pub async fn exec_query(
|
||||
dfctx: &datafusion::execution::context::SessionContext,
|
||||
q: &str,
|
||||
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError> {
|
||||
query_to_df(dfctx, q)?
|
||||
query_to_df(dfctx, q)
|
||||
.await?
|
||||
.collect()
|
||||
.await
|
||||
.map_err(QueryError::query_exec)
|
||||
@ -367,8 +363,8 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::test_util::*;
|
||||
|
||||
#[test]
|
||||
fn simple_query_planning() -> anyhow::Result<()> {
|
||||
#[tokio::test]
|
||||
async fn simple_query_planning() -> anyhow::Result<()> {
|
||||
let mut dfctx = SessionContext::new();
|
||||
register_table_properties(&mut dfctx)?;
|
||||
|
||||
@ -386,21 +382,23 @@ mod tests {
|
||||
bath
|
||||
}
|
||||
}"#,
|
||||
)?;
|
||||
)
|
||||
.await?;
|
||||
|
||||
let expected_df = dfctx
|
||||
.table("properties")?
|
||||
.table("properties")
|
||||
.await?
|
||||
.filter(col("bath").gt_eq(lit(2i64)))?
|
||||
.filter(col("bed").gt(lit(3i64)))?
|
||||
.select(vec![col("address"), col("bed"), col("bath")])?;
|
||||
|
||||
assert_eq_df(df, expected_df);
|
||||
assert_eq_df(df.into(), expected_df.into());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn consistent_and_deterministics_logical_plan() -> anyhow::Result<()> {
|
||||
#[tokio::test]
|
||||
async fn consistent_and_deterministics_logical_plan() -> anyhow::Result<()> {
|
||||
let mut dfctx = SessionContext::new();
|
||||
register_table_properties(&mut dfctx)?;
|
||||
|
||||
@ -420,16 +418,18 @@ mod tests {
|
||||
bed
|
||||
}
|
||||
}"#,
|
||||
)?;
|
||||
)
|
||||
.await?;
|
||||
|
||||
let expected_df = dfctx
|
||||
.table("properties")?
|
||||
.table("properties")
|
||||
.await?
|
||||
.filter(col("bed").gt(lit(3i64)))?
|
||||
.select(vec![col("address"), col("bed")])?
|
||||
.sort(vec![column_sort_expr_asc("bed")])?
|
||||
.limit(0, Some(10))?;
|
||||
|
||||
assert_eq_df(df, expected_df);
|
||||
assert_eq_df(df.into(), expected_df.into());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@ -460,18 +460,12 @@ mod tests {
|
||||
|
||||
assert_eq!(
|
||||
batch.column(0).as_ref(),
|
||||
Arc::new(StringArray::from(vec!["Kenmore, WA", "Fremont, WA",])).as_ref(),
|
||||
&StringArray::from(vec!["Kenmore, WA", "Fremont, WA",]),
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
batch.column(1).as_ref(),
|
||||
Arc::new(Int64Array::from(vec![4, 5])).as_ref(),
|
||||
);
|
||||
assert_eq!(batch.column(1).as_ref(), &Int64Array::from(vec![4, 5]),);
|
||||
|
||||
assert_eq!(
|
||||
batch.column(2).as_ref(),
|
||||
Arc::new(Int64Array::from(vec![3, 3])).as_ref(),
|
||||
);
|
||||
assert_eq!(batch.column(2).as_ref(), &Int64Array::from(vec![3, 3]),);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@ -1,19 +1,20 @@
|
||||
use datafusion::logical_expr::expr::Sort;
|
||||
use datafusion::prelude::{Column, Expr};
|
||||
|
||||
pub fn column_sort_expr_desc(column: String) -> Expr {
|
||||
Expr::Sort {
|
||||
Expr::Sort(Sort {
|
||||
expr: Box::new(Expr::Column(Column::from_name(column))),
|
||||
asc: false,
|
||||
nulls_first: true,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn column_sort_expr_asc(column: impl Into<String>) -> Expr {
|
||||
Expr::Sort {
|
||||
Expr::Sort(Sort {
|
||||
expr: Box::new(Expr::Column(Column::from_name(column))),
|
||||
asc: true,
|
||||
nulls_first: true,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub mod graphql;
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use datafusion::arrow::record_batch::RecordBatch;
|
||||
use datafusion::logical_expr::Operator;
|
||||
@ -54,11 +53,11 @@ fn num_parse_err(e: std::num::ParseIntError) -> QueryError {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn table_query_to_df(
|
||||
pub async fn table_query_to_df(
|
||||
dfctx: &datafusion::execution::context::SessionContext,
|
||||
table_name: &str,
|
||||
params: &HashMap<String, String>,
|
||||
) -> Result<Arc<datafusion::dataframe::DataFrame>, QueryError> {
|
||||
) -> Result<datafusion::dataframe::DataFrame, QueryError> {
|
||||
lazy_static! {
|
||||
static ref RE_REST_FILTER: Regex =
|
||||
Regex::new(r"filter\[(?P<column>.+)\](?P<op>.+)?").unwrap();
|
||||
@ -66,6 +65,7 @@ pub fn table_query_to_df(
|
||||
|
||||
let mut df = dfctx
|
||||
.table(table_name)
|
||||
.await
|
||||
.map_err(|e| QueryError::invalid_table(e, table_name))?;
|
||||
|
||||
// filter[col1]eq='foo'
|
||||
@ -163,7 +163,7 @@ pub async fn query_table(
|
||||
table_name: &str,
|
||||
params: &HashMap<String, String>,
|
||||
) -> Result<Vec<RecordBatch>, QueryError> {
|
||||
let df = table_query_to_df(dfctx, table_name, params)?;
|
||||
let df = table_query_to_df(dfctx, table_name, params).await?;
|
||||
df.collect().await.map_err(QueryError::query_exec)
|
||||
}
|
||||
|
||||
@ -188,18 +188,20 @@ mod tests {
|
||||
params.insert("columns".to_string(), "ami_id,version".to_string());
|
||||
params.insert("filter[arch]".to_string(), "'amd64'".to_string());
|
||||
|
||||
let df = table_query_to_df(&dfctx, "ubuntu_ami", ¶ms)?;
|
||||
let df = table_query_to_df(&dfctx, "ubuntu_ami", ¶ms).await?;
|
||||
|
||||
assert_eq_df(
|
||||
df,
|
||||
df.into(),
|
||||
dfctx
|
||||
.table("ubuntu_ami")?
|
||||
.table("ubuntu_ami")
|
||||
.await?
|
||||
.filter(
|
||||
col("arch").eq(Expr::Literal(ScalarValue::Utf8(Some("amd64".to_string())))),
|
||||
)?
|
||||
.select(vec![col("ami_id"), col("version")])?
|
||||
.sort(vec![column_sort_expr_asc("ami_id")])?
|
||||
.limit(0, Some(10))?,
|
||||
.limit(0, Some(10))?
|
||||
.into(),
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@ -221,7 +223,7 @@ mod tests {
|
||||
let batch = &batches[0];
|
||||
assert_eq!(
|
||||
batch.column(0).as_ref(),
|
||||
Arc::new(StringArray::from(vec!["<a href=\"https://console.aws.amazon.com/ec2/home?region=us-east-2#launchAmi=ami-091a87cd1ff23d97c\">ami-091a87cd1ff23d97c</a>"])).as_ref(),
|
||||
&StringArray::from(vec!["<a href=\"https://console.aws.amazon.com/ec2/home?region=us-east-2#launchAmi=ami-091a87cd1ff23d97c\">ami-091a87cd1ff23d97c</a>"]),
|
||||
);
|
||||
|
||||
Ok(())
|
||||
|
||||
@ -1,5 +1,3 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use datafusion::arrow;
|
||||
|
||||
use crate::error::QueryError;
|
||||
@ -9,12 +7,12 @@ pub async fn exec_query(
|
||||
sql: &str,
|
||||
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError> {
|
||||
let plan = dfctx
|
||||
.state()
|
||||
.create_logical_plan(sql)
|
||||
.await
|
||||
.map_err(QueryError::plan_sql)?;
|
||||
|
||||
let df: Arc<datafusion::dataframe::DataFrame> = Arc::new(
|
||||
datafusion::dataframe::DataFrame::new(dfctx.state.clone(), &plan),
|
||||
);
|
||||
let df = datafusion::dataframe::DataFrame::new(dfctx.state(), plan);
|
||||
|
||||
df.collect().await.map_err(QueryError::query_exec)
|
||||
}
|
||||
@ -47,15 +45,12 @@ mod tests {
|
||||
|
||||
assert_eq!(
|
||||
batch.column(0).as_ref(),
|
||||
Arc::new(StringArray::from(vec![
|
||||
"Carl", "Daniel", "Mike", "Roger", "Sam",
|
||||
]))
|
||||
.as_ref(),
|
||||
&StringArray::from(vec!["Carl", "Daniel", "Mike", "Roger", "Sam",]),
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
batch.column(1).as_ref(),
|
||||
Arc::new(Int64Array::from(vec![3, 3, 4, 3, 2])).as_ref(),
|
||||
&Int64Array::from(vec![3, 3, 4, 3, 2]),
|
||||
);
|
||||
|
||||
Ok(())
|
||||
|
||||
@ -79,7 +79,7 @@ mod tests {
|
||||
.await?;
|
||||
|
||||
let ctx = SessionContext::new();
|
||||
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
|
||||
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
|
||||
assert_eq!(stats.num_rows, Some(37 * 3));
|
||||
|
||||
Ok(())
|
||||
@ -92,7 +92,7 @@ mod tests {
|
||||
let t = to_mem_table(&TableSource::new("uk_cities".to_string(), test_path)).await?;
|
||||
|
||||
let ctx = SessionContext::new();
|
||||
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
|
||||
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
|
||||
assert_eq!(stats.num_rows, Some(37));
|
||||
|
||||
Ok(())
|
||||
|
||||
@ -79,7 +79,7 @@ mod tests {
|
||||
.await?;
|
||||
|
||||
let ctx = SessionContext::new();
|
||||
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
|
||||
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
|
||||
assert_eq!(stats.num_rows, Some(37 * 3));
|
||||
|
||||
Ok(())
|
||||
@ -92,7 +92,7 @@ mod tests {
|
||||
let t = to_mem_table(&TableSource::new("uk_cities".to_string(), test_path)).await?;
|
||||
|
||||
let ctx = SessionContext::new();
|
||||
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
|
||||
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
|
||||
assert_eq!(stats.num_rows, Some(37));
|
||||
|
||||
Ok(())
|
||||
|
||||
@ -132,7 +132,7 @@ mod tests {
|
||||
.await?;
|
||||
|
||||
let ctx = SessionContext::new();
|
||||
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
|
||||
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
|
||||
assert_eq!(stats.num_rows, Some(37 * 3));
|
||||
|
||||
Ok(())
|
||||
@ -155,7 +155,7 @@ c1,c2,c3
|
||||
let t = to_mem_table(&source).await?;
|
||||
|
||||
let ctx = SessionContext::new();
|
||||
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
|
||||
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
|
||||
assert_eq!(stats.num_rows, Some(3));
|
||||
|
||||
Ok(())
|
||||
|
||||
@ -140,7 +140,7 @@ mod tests {
|
||||
.await?;
|
||||
|
||||
let ctx = SessionContext::new();
|
||||
validate_statistics(t.scan(&ctx.state(), &None, &[], None).await?.statistics());
|
||||
validate_statistics(t.scan(&ctx.state(), None, &[], None).await?.statistics());
|
||||
|
||||
match t.as_any().downcast_ref::<MemTable>() {
|
||||
Some(_) => Ok(()),
|
||||
|
||||
@ -141,7 +141,7 @@ fn infer_schema(rows: &[Vec<String>]) -> Schema {
|
||||
let dt = dt_iter.fold(dt_init, coerce_type);
|
||||
|
||||
// normalize column name by replacing space with under score
|
||||
Field::new(&col_name.replace(' ', "_"), dt, true)
|
||||
Field::new(col_name.replace(' ', "_"), dt, true)
|
||||
})
|
||||
.collect();
|
||||
Schema::new(fields)
|
||||
|
||||
@ -3,6 +3,7 @@ use std::sync::Arc;
|
||||
|
||||
use datafusion::arrow;
|
||||
use datafusion::arrow::datatypes::Schema;
|
||||
#[allow(deprecated)]
|
||||
use datafusion::arrow::json::reader::{Decoder, DecoderOptions};
|
||||
use datafusion::arrow::record_batch::RecordBatch;
|
||||
use serde_json::value::Value;
|
||||
@ -55,6 +56,8 @@ fn json_vec_to_partition(
|
||||
};
|
||||
|
||||
// decode to arrow record batch
|
||||
#[allow(deprecated)]
|
||||
// TODO: switch to RawDecoder
|
||||
let decoder = Decoder::new(
|
||||
Arc::new(schema.clone()),
|
||||
DecoderOptions::new().with_batch_size(batch_size),
|
||||
|
||||
@ -704,7 +704,7 @@ batch_size: 512
|
||||
let ctx = datafusion::prelude::SessionContext::new();
|
||||
let table = load(&t, &ctx).await?;
|
||||
let stats = table
|
||||
.scan(&ctx.state(), &None, &[], None)
|
||||
.scan(&ctx.state(), None, &[], None)
|
||||
.await?
|
||||
.statistics();
|
||||
assert_eq!(stats.num_rows, Some(37));
|
||||
@ -726,7 +726,7 @@ uri: "sqlite://../test_data/sqlite/sample.{}"
|
||||
let ctx = datafusion::prelude::SessionContext::new();
|
||||
let table = load(&t, &ctx).await?;
|
||||
let stats = table
|
||||
.scan(&ctx.state(), &None, &[], None)
|
||||
.scan(&ctx.state(), None, &[], None)
|
||||
.await?
|
||||
.statistics();
|
||||
assert_eq!(stats.num_rows, Some(37));
|
||||
|
||||
@ -2,6 +2,7 @@ use std::io::{BufReader, Read};
|
||||
use std::sync::Arc;
|
||||
|
||||
use datafusion::arrow::datatypes::{Schema, SchemaRef};
|
||||
#[allow(deprecated)]
|
||||
use datafusion::arrow::json::reader::{infer_json_schema, Decoder, DecoderOptions, ValueIter};
|
||||
use datafusion::arrow::record_batch::RecordBatch;
|
||||
|
||||
@ -18,6 +19,8 @@ fn decode_json_from_reader<R: Read>(
|
||||
schema_ref: SchemaRef,
|
||||
batch_size: usize,
|
||||
) -> Result<Vec<RecordBatch>, ColumnQError> {
|
||||
#[allow(deprecated)]
|
||||
// TODO: switch to RawDecoder
|
||||
let decoder = Decoder::new(
|
||||
schema_ref,
|
||||
DecoderOptions::new().with_batch_size(batch_size),
|
||||
|
||||
@ -121,7 +121,7 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
let stats = t
|
||||
.scan(&ctx.state(), &None, &[], None)
|
||||
.scan(&ctx.state(), None, &[], None)
|
||||
.await
|
||||
.unwrap()
|
||||
.statistics();
|
||||
@ -146,7 +146,7 @@ mod tests {
|
||||
.await?;
|
||||
|
||||
let ctx = SessionContext::new();
|
||||
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
|
||||
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
|
||||
assert_eq!(stats.num_rows, Some(500));
|
||||
|
||||
Ok(())
|
||||
@ -171,7 +171,7 @@ mod tests {
|
||||
.await?;
|
||||
|
||||
let ctx = SessionContext::new();
|
||||
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
|
||||
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
|
||||
assert_eq!(stats.num_rows, Some(1500));
|
||||
|
||||
Ok(())
|
||||
|
||||
@ -64,7 +64,7 @@ fn infer_schema(r: &Range<calamine::DataType>) -> Result<Schema, ColumnQError> {
|
||||
|
||||
let mut dt_iter = set.iter().cloned();
|
||||
let dt = dt_iter.next().unwrap_or(DataType::Utf8);
|
||||
Field::new(&col_name.replace(' ', "_"), dt, true)
|
||||
Field::new(col_name.replace(' ', "_"), dt, true)
|
||||
})
|
||||
.collect();
|
||||
Ok(Schema::new(fields))
|
||||
@ -182,7 +182,7 @@ sheet_name = "uk_cities_with_headers"
|
||||
let t = to_mem_table(&table_source).await.unwrap();
|
||||
let ctx = SessionContext::new();
|
||||
let stats = t
|
||||
.scan(&ctx.state(), &None, &[], None)
|
||||
.scan(&ctx.state(), None, &[], None)
|
||||
.await
|
||||
.unwrap()
|
||||
.statistics();
|
||||
@ -207,7 +207,7 @@ option:
|
||||
let t = to_mem_table(&table_source).await.unwrap();
|
||||
let ctx = SessionContext::new();
|
||||
let stats = t
|
||||
.scan(&ctx.state(), &None, &[], None)
|
||||
.scan(&ctx.state(), None, &[], None)
|
||||
.await
|
||||
.unwrap()
|
||||
.statistics();
|
||||
|
||||
@ -137,7 +137,7 @@ pub async fn register_table_ubuntu_ami(dfctx: &mut SessionContext) -> anyhow::Re
|
||||
|
||||
pub fn assert_eq_df(df1: Arc<DataFrame>, df2: Arc<DataFrame>) {
|
||||
assert_eq!(
|
||||
format!("{:?}", df1.to_logical_plan()),
|
||||
format!("{:?}", df2.to_logical_plan())
|
||||
format!("{:?}", (*df1).clone().into_optimized_plan()),
|
||||
format!("{:?}", (*df2).clone().into_optimized_plan())
|
||||
);
|
||||
}
|
||||
|
||||
@ -15,7 +15,7 @@ mod mysql {
|
||||
let t = DatabaseLoader::MySQL
|
||||
.to_mem_table(&TableSource::new(name, env::var("MYSQL_URL")?))?;
|
||||
let ctx = SessionContext::new();
|
||||
let stats = t.scan(&ctx.state(), &None, &[], None).await?.statistics();
|
||||
let stats = t.scan(&ctx.state(), None, &[], None).await?.statistics();
|
||||
assert!(stats.num_rows.is_some());
|
||||
}
|
||||
|
||||
|
||||
@ -40,8 +40,8 @@ clap = { version = "3", features = ["color"] }
|
||||
thiserror = "1"
|
||||
anyhow = "1"
|
||||
|
||||
convergence = "0.9"
|
||||
convergence-arrow = "0.9"
|
||||
convergence = "0.11.0"
|
||||
convergence-arrow = "0.11.0"
|
||||
|
||||
[features]
|
||||
default = ["rustls", "snmalloc"]
|
||||
|
||||
@ -1,8 +1,7 @@
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use columnq::datafusion::arrow;
|
||||
use columnq::arrow::record_batch::RecordBatch;
|
||||
use columnq::datafusion::dataframe::DataFrame;
|
||||
use columnq::datafusion::error::DataFusionError;
|
||||
use columnq::encoding;
|
||||
@ -61,25 +60,19 @@ pub trait RoapiContext: Send + Sync + 'static {
|
||||
|
||||
async fn table_schema_json_bytes(&self, table_name: &str) -> Result<Vec<u8>, ApiErrResp>;
|
||||
|
||||
async fn query_graphql(
|
||||
&self,
|
||||
query: &str,
|
||||
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError>;
|
||||
async fn query_graphql(&self, query: &str) -> Result<Vec<RecordBatch>, QueryError>;
|
||||
|
||||
async fn query_sql(
|
||||
&self,
|
||||
query: &str,
|
||||
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError>;
|
||||
async fn query_sql(&self, query: &str) -> Result<Vec<RecordBatch>, QueryError>;
|
||||
|
||||
async fn query_rest_table(
|
||||
&self,
|
||||
table_name: &str,
|
||||
params: &HashMap<String, String>,
|
||||
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError>;
|
||||
) -> Result<Vec<RecordBatch>, QueryError>;
|
||||
|
||||
async fn kv_get(&self, kv_name: &str, key: &str) -> Result<Option<String>, QueryError>;
|
||||
|
||||
async fn sql_to_df(&self, query: &str) -> Result<Arc<DataFrame>, DataFusionError>;
|
||||
async fn sql_to_df(&self, query: &str) -> Result<DataFrame, DataFusionError>;
|
||||
|
||||
async fn get_response_format(&self) -> encoding::ContentType;
|
||||
}
|
||||
@ -119,18 +112,12 @@ impl RoapiContext for RawRoapiContext {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
async fn query_graphql(
|
||||
&self,
|
||||
query: &str,
|
||||
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError> {
|
||||
async fn query_graphql(&self, query: &str) -> Result<Vec<RecordBatch>, QueryError> {
|
||||
self.cq.query_graphql(query).await
|
||||
}
|
||||
|
||||
#[inline]
|
||||
async fn query_sql(
|
||||
&self,
|
||||
query: &str,
|
||||
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError> {
|
||||
async fn query_sql(&self, query: &str) -> Result<Vec<RecordBatch>, QueryError> {
|
||||
self.cq.query_sql(query).await
|
||||
}
|
||||
|
||||
@ -139,7 +126,7 @@ impl RoapiContext for RawRoapiContext {
|
||||
&self,
|
||||
table_name: &str,
|
||||
params: &HashMap<String, String>,
|
||||
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError> {
|
||||
) -> Result<Vec<RecordBatch>, QueryError> {
|
||||
self.cq.query_rest_table(table_name, params).await
|
||||
}
|
||||
|
||||
@ -149,7 +136,7 @@ impl RoapiContext for RawRoapiContext {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
async fn sql_to_df(&self, query: &str) -> Result<Arc<DataFrame>, DataFusionError> {
|
||||
async fn sql_to_df(&self, query: &str) -> Result<DataFrame, DataFusionError> {
|
||||
self.cq.dfctx.sql(query).await
|
||||
}
|
||||
|
||||
@ -195,19 +182,13 @@ impl RoapiContext for ConcurrentRoapiContext {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
async fn query_graphql(
|
||||
&self,
|
||||
query: &str,
|
||||
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError> {
|
||||
async fn query_graphql(&self, query: &str) -> Result<Vec<RecordBatch>, QueryError> {
|
||||
let ctx = self.read().await;
|
||||
ctx.cq.query_graphql(query).await
|
||||
}
|
||||
|
||||
#[inline]
|
||||
async fn query_sql(
|
||||
&self,
|
||||
query: &str,
|
||||
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError> {
|
||||
async fn query_sql(&self, query: &str) -> Result<Vec<RecordBatch>, QueryError> {
|
||||
let ctx = self.read().await;
|
||||
ctx.cq.query_sql(query).await
|
||||
}
|
||||
@ -217,7 +198,7 @@ impl RoapiContext for ConcurrentRoapiContext {
|
||||
&self,
|
||||
table_name: &str,
|
||||
params: &HashMap<String, String>,
|
||||
) -> Result<Vec<arrow::record_batch::RecordBatch>, QueryError> {
|
||||
) -> Result<Vec<RecordBatch>, QueryError> {
|
||||
let ctx = self.read().await;
|
||||
ctx.cq.query_rest_table(table_name, params).await
|
||||
}
|
||||
@ -229,7 +210,7 @@ impl RoapiContext for ConcurrentRoapiContext {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
async fn sql_to_df(&self, query: &str) -> Result<Arc<DataFrame>, DataFusionError> {
|
||||
async fn sql_to_df(&self, query: &str) -> Result<DataFrame, DataFusionError> {
|
||||
let ctx = self.read().await;
|
||||
ctx.cq.dfctx.sql(query).await
|
||||
}
|
||||
|
||||
@ -20,18 +20,19 @@ use crate::context::RoapiContext;
|
||||
use crate::server::RunnableServer;
|
||||
|
||||
fn df_err_to_sql(err: DataFusionError) -> ErrorResponse {
|
||||
ErrorResponse::error(SqlState::DATA_EXCEPTION, err.to_string())
|
||||
ErrorResponse::error(SqlState::DataException, err.to_string())
|
||||
}
|
||||
|
||||
/// A portal built using a logical DataFusion query plan.
|
||||
pub struct DataFusionPortal {
|
||||
df: Arc<DataFrame>,
|
||||
df: DataFrame,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Portal for DataFusionPortal {
|
||||
async fn fetch(&mut self, batch: &mut DataRowBatch) -> Result<(), ErrorResponse> {
|
||||
for arrow_batch in self.df.collect().await.map_err(df_err_to_sql)? {
|
||||
let arrow_batches = self.df.clone().collect().await.map_err(df_err_to_sql)?;
|
||||
for arrow_batch in arrow_batches {
|
||||
record_batch_to_rows(&arrow_batch, batch)?;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
Loading…
Reference in New Issue
Block a user