roapi/columnq/src/table/json.rs

202 lines
6.0 KiB
Rust

use std::io::Read;
use std::sync::Arc;
use serde_json::value::Value;
use uriparse::URIReference;
use crate::error::ColumnQError;
use crate::table::{TableLoadOption, TableSource};
fn json_value_from_reader<R: Read>(r: R) -> Result<Value, ColumnQError> {
serde_json::from_reader(r).map_err(ColumnQError::json_parse)
}
async fn load_array_by_path<'a>(
uri: URIReference<'a>,
pointer: Option<&'a str>,
) -> Result<Vec<Value>, ColumnQError> {
let payload: Value = with_reader_from_uri!(json_value_from_reader, uri)?;
let mut value_ref: &Value = &payload;
if let Some(p) = pointer {
match value_ref.pointer(p) {
Some(v) => value_ref = v,
None => {
return Err(ColumnQError::LoadJson(format!(
"Invalid json pointer: {}",
p
)))
}
}
}
match value_ref.as_array() {
Some(arr) => Ok(arr.to_vec()),
None => Err(ColumnQError::LoadJson(format!(
"{} is not an array",
pointer.unwrap_or("JSON data")
))),
}
}
pub async fn to_mem_table(
t: &TableSource,
) -> Result<datafusion::datasource::MemTable, ColumnQError> {
// TODO: make batch size configurable
let batch_size = 1024;
let array_encoded = match &t.option {
Some(TableLoadOption::json { array_encoded, .. }) => array_encoded.unwrap_or(false),
_ => false,
};
if array_encoded && t.schema.is_none() {
return Err(ColumnQError::LoadJson(
"Array encoded option requires manually specified schema".to_string(),
));
}
let pointer = match &t.option {
Some(TableLoadOption::json { pointer, .. }) => pointer.to_owned(),
_ => None,
};
// load array from file
let json_rows = load_array_by_path(t.parsed_uri()?, pointer.as_deref()).await?;
if json_rows.is_empty() {
match pointer {
Some(p) => {
return Err(ColumnQError::LoadJson(format!(
"{} points to an emtpy array",
p
)));
}
None => {
return Err(ColumnQError::LoadJson(
"JSON data is an emtpy array".to_string(),
));
}
}
}
// load schema
let schema_ref: arrow::datatypes::SchemaRef = match &t.schema {
Some(s) => Arc::new(s.into()),
None => arrow::json::reader::infer_json_schema_from_iterator(
json_rows.iter().map(|v| Ok(v.clone())),
)
.map_err(|e| {
ColumnQError::LoadJson(format!("Failed to infer schema from JSON data: {}", e))
})?,
};
// decode to arrow record batch
let decoder = arrow::json::reader::Decoder::new(schema_ref.clone(), batch_size, None);
let batch = {
// enclose values_iter in its own scope so it won't brrow schema_ref til end of this
// function
let mut values_iter: Box<dyn Iterator<Item = arrow::error::Result<Value>>>;
values_iter = if array_encoded {
// convert row array to object based on schema
// TODO: support array_encoded read in arrow json reader instead
Box::new(json_rows.into_iter().map(|json_row| {
let mut m = serde_json::map::Map::new();
schema_ref
.fields()
.iter()
.enumerate()
.try_for_each(|(i, f)| match json_row.get(i) {
Some(x) => {
m.insert(f.name().to_string(), x.clone());
Ok(())
}
None => Err(arrow::error::ArrowError::JsonError(format!(
"arry encoded JSON row missing column {:?} : {:?}",
i, json_row
))),
})?;
Ok(Value::Object(m))
}))
} else {
// no need to convert row since each row is already an object
Box::new(json_rows.into_iter().map(Ok))
};
// decode whole array into single record batch
decoder
.next_batch(&mut values_iter)
.map_err(|e| {
ColumnQError::LoadJson(format!("Failed decode JSON into Arrow record batch: {}", e))
})?
.ok_or_else(|| {
ColumnQError::LoadJson("JSON data results in empty arrow record batch".to_string())
})?
};
let partitions = vec![vec![batch]];
Ok(datafusion::datasource::MemTable::try_new(
schema_ref, partitions,
)?)
}
#[cfg(test)]
mod tests {
use super::*;
use datafusion::datasource::TableProvider;
use crate::test_util::*;
#[tokio::test]
async fn nested_struct_and_lists() -> Result<(), ColumnQError> {
let t = to_mem_table(&TableSource {
name: "spacex_launches".to_string(),
uri: test_data_path("spacex-launches.json"),
schema: None,
option: None,
})
.await?;
let schema = t.schema();
let fields = schema.fields();
let mut obj_keys = fields.iter().map(|f| f.name()).collect::<Vec<_>>();
obj_keys.sort();
let mut expected_obj_keys = vec![
"fairings",
"links",
"static_fire_date_utc",
"static_fire_date_unix",
"tbd",
"net",
"window",
"rocket",
"success",
"details",
"crew",
"ships",
"capsules",
"payloads",
"launchpad",
"auto_update",
"failures",
"flight_number",
"name",
"date_unix",
"date_utc",
"date_local",
"date_precision",
"upcoming",
"cores",
"id",
"launch_library_id",
];
expected_obj_keys.sort();
assert_eq!(obj_keys, expected_obj_keys,);
Ok(())
}
}