From 81a25205b76efa436029265f574cc382e81b8977 Mon Sep 17 00:00:00 2001 From: Maksym Dovhal Date: Thu, 1 Feb 2024 06:16:34 +0200 Subject: [PATCH] Update xlsx table (#316) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR is a fix for issue https://github.com/roapi/roapi/issues/259 List of updates/fixes: * Module xlsx renamed to excel. * Allow reading not only the xlsx format but also xls, ods, and xlsb. * Allow the Excel DateTime format and transform it to Arrow Timestamp(Second, None). * Allow NULLs in any data type and use a null value instead of the string "null". * Fix incorrect data type inference when multiple data types are detected. * Add the possibility to specify the data schema in the config. * Add new options: -
rows_range_start - rows_range_end - columns_range_start - columns_range_end - schema_inference_lines * Make sheet_name optional; if it is not specified, then use the first sheet by default
 * Bump calamine crate to version 0.23.1 and add feature "dates" (supporting for DateTime column format) Documentation updates: https://github.com/roapi/docs/pull/20 --- Cargo.lock | 28 +- README.md | 2 +- columnq/Cargo.toml | 2 +- columnq/src/error.rs | 4 +- columnq/src/table/excel.rs | 771 +++++++++++++++++++++++++++++++++++++ columnq/src/table/mod.rs | 37 +- columnq/src/table/xlsx.rs | 272 ------------- test_data/excel_range.ods | Bin 0 -> 10927 bytes 8 files changed, 816 insertions(+), 300 deletions(-) create mode 100644 columnq/src/table/excel.rs delete mode 100644 columnq/src/table/xlsx.rs create mode 100644 test_data/excel_range.ods diff --git a/Cargo.lock b/Cargo.lock index 95ab0bc..66658a5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -851,15 +851,17 @@ dependencies = [ [[package]] name = "calamine" -version = "0.19.1" +version = "0.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6381d1037ee9b8a6c8eb97936add0331a1aabd148d5b6f35f1cda6e5dec44f40" +checksum = "47a4d6ea525ea187df1e3a1c4b23469b1cbe60c5bafc1c0ef14b2b8738a8303d" dependencies = [ "byteorder", + "chrono", "codepage", "encoding_rs", "log", - "quick-xml 0.25.0", + "once_cell", + "quick-xml 0.31.0", "serde", "zip", ] @@ -3496,16 +3498,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "quick-xml" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58e21a144a0ffb5fad7b464babcdab934a325ad69b7c0373bcfef5cbd9799ca9" -dependencies = [ - "encoding_rs", - "memchr", -] - [[package]] name = "quick-xml" version = "0.28.2" @@ -3516,6 +3508,16 @@ dependencies = [ "serde", ] +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "encoding_rs", + "memchr", +] + [[package]] name = "quote" version = "1.0.33" diff --git a/README.md b/README.md index 2b53f71..8bf2016 
100644 --- a/README.md +++ b/README.md @@ -318,7 +318,7 @@ Data layer: - [x] JSON - [x] NDJSON - [x] parquet - - [x] xls, xlsx, xlsm, ods: https://github.com/tafia/calamine + - [x] xls, xlsx, xlsb, ods: https://github.com/tafia/calamine - [x] [DeltaLake](https://delta.io/) Misc: diff --git a/columnq/Cargo.toml b/columnq/Cargo.toml index 4d979b7..f2216b5 100644 --- a/columnq/Cargo.toml +++ b/columnq/Cargo.toml @@ -39,7 +39,7 @@ reqwest = { version = "0.11", default-features = false, features = [ "blocking", "json", ] } -calamine = "0.19.1" +calamine = {version = "0.23.1", features = ["dates"]} tokio = { version = "1", features = ["rt-multi-thread"] } futures = "0.3" diff --git a/columnq/src/error.rs b/columnq/src/error.rs index 5aeae09..e290ea1 100644 --- a/columnq/src/error.rs +++ b/columnq/src/error.rs @@ -39,8 +39,8 @@ pub enum ColumnQError { #[error("Error loading Delta table: {0}")] LoadDelta(String), - #[error("Error loading Xlsx table: {0}")] - LoadXlsx(String), + #[error("Error loading Excel table: {0}")] + LoadExcel(String), #[error("Error loading data from HTTP store: {0}")] HttpStore(String), diff --git a/columnq/src/table/excel.rs b/columnq/src/table/excel.rs new file mode 100644 index 0000000..1b35fee --- /dev/null +++ b/columnq/src/table/excel.rs @@ -0,0 +1,771 @@ +use crate::table::{self, TableOptionExcel, TableSchema, TableSource}; +use arrow_schema::TimeUnit; +use calamine::{open_workbook_auto, DataType as ExcelDataType, Range, Reader, Sheets}; +use datafusion::arrow::array::{ + ArrayRef, BooleanArray, DurationSecondArray, NullArray, PrimitiveArray, StringArray, + TimestampSecondArray, +}; +use datafusion::arrow::datatypes::{ + DataType, Date32Type, Date64Type, Field, Float64Type, Int64Type, Schema, +}; +use datafusion::arrow::record_batch::RecordBatch; +use snafu::prelude::*; +use std::collections::HashMap; +use std::sync::Arc; +use std::vec; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to load Excel: {msg}"))] + Load { 
msg: String }, + #[snafu(display("Incorrect schema: {msg}"))] + IncorrectSchema { msg: String }, + #[snafu(display("Excel schema inference error"))] + SchemaInference, + #[snafu(display("Failed to create record batch: {source}"))] + CreateRecordBatch { + source: datafusion::arrow::error::ArrowError, + }, + #[snafu(display("Failed to open workbook: {source}"))] + OpenWorkbook { source: calamine::Error }, +} + +struct ExcelSubrange<'a> { + rows: calamine::Rows<'a, ExcelDataType>, + columns_range_start: usize, + columns_range_end: usize, + total_rows: usize, + current_row_id: usize, +} + +impl<'a> ExcelSubrange<'a> { + fn new( + range: &'a Range, + rows_range_start: Option, + rows_range_end: Option, + columns_range_start: Option, + columns_range_end: Option, + ) -> ExcelSubrange { + let rows_range_start = rows_range_start.unwrap_or(usize::MIN); + let rows_range_end = rows_range_end + .or(range.end().map(|v| v.0 as usize)) + .unwrap(); + + let mut rows = range.rows(); + if rows_range_start > 0 { + // rows skipping + rows.nth(rows_range_start - 1); + } + + ExcelSubrange { + rows, + columns_range_start: columns_range_start.unwrap_or(usize::MIN), + columns_range_end: columns_range_end.unwrap_or(usize::MAX), + total_rows: rows_range_end - rows_range_start + 1, + current_row_id: 0, + } + } + + fn size(&self) -> usize { + self.total_rows + } +} + +impl<'a> Iterator for ExcelSubrange<'a> { + type Item = &'a [ExcelDataType]; + + fn next(&mut self) -> Option { + if self.current_row_id < self.total_rows { + self.current_row_id += 1; + self.rows + .next() + .map(|x| &x[self.columns_range_start..=self.columns_range_end.min(x.len() - 1)]) + } else { + None + } + } + + fn size_hint(&self) -> (usize, Option) { + (0, Some(self.total_rows)) + } +} + +fn infer_value_type(v: &ExcelDataType) -> Result { + match v { + ExcelDataType::Int(_) => Ok(DataType::Int64), + ExcelDataType::Float(_) => Ok(DataType::Float64), + ExcelDataType::String(_) => Ok(DataType::Utf8), + ExcelDataType::Bool(_) 
=> Ok(DataType::Boolean), + ExcelDataType::DateTime(_) | ExcelDataType::DateTimeIso(_) => { + Ok(DataType::Timestamp(TimeUnit::Second, None)) + } + ExcelDataType::Duration(_) | ExcelDataType::DurationIso(_) => { + Ok(DataType::Duration(TimeUnit::Second)) + } + ExcelDataType::Error(e) => Err(Error::Load { msg: e.to_string() }), + ExcelDataType::Empty => Ok(DataType::Null), + } +} + +fn infer_schema_from_data(mut range: ExcelSubrange) -> Result { + let mut col_types: HashMap<&str, DataType> = HashMap::new(); + let col_names: Vec<&str> = range + .next() + .ok_or(Error::Load { + msg: String::from("Failed to infer schema for empty excel table"), + })? + .iter() + .enumerate() + .map(|(i, c)| { + c.get_string().ok_or_else(|| Error::Load { + msg: format!("The {i}th column name is empty"), + }) + }) + .collect::, _>>()?; + + for row in range { + for (i, col_val) in row.iter().enumerate() { + let col_name = col_names.get(i).ok_or(Error::Load { + msg: String::from( + "Failed to infer schema. Number of values in row is more then column names.", + ), + })?; + let col_type = infer_value_type(col_val)?; + col_types + .entry(col_name) + .and_modify(|ct| { + if !ct.equals_datatype(&col_type) && ct.equals_datatype(&DataType::Null) { + *ct = col_type.clone(); + } + // if column values has more than one not null type then we upcast column type to the most general datatype Utf8. 
+ else if !ct.equals_datatype(&col_type) + && !&col_type.equals_datatype(&DataType::Null) + { + *ct = DataType::Utf8; + } + }) + .or_insert(col_type); + } + } + + let fields: Vec = col_names + .iter() + .map(|col_name| { + let dt = col_types.get(col_name).unwrap_or(&DataType::Utf8).clone(); + Field::new(col_name.replace(' ', "_"), dt, true) + }) + .collect(); + Ok(Schema::new(fields)) +} + +fn infer_schema_from_config(table_schema: &TableSchema) -> Result { + let unsupported_data_types = table_schema + .columns + .iter() + .filter(|c| { + !matches!( + c.data_type, + DataType::Boolean + | DataType::Int64 + | DataType::Float64 + | DataType::Duration(TimeUnit::Second) + | DataType::Date32 + | DataType::Date64 + | DataType::Null + | DataType::Utf8 + | DataType::Timestamp(TimeUnit::Second, None) + ) + }) + .map(|c| c.name.clone()) + .collect::>() + .join(", "); + + if unsupported_data_types.is_empty() { + Ok(table_schema.into()) + } else { + Err(Error::IncorrectSchema{msg: format!("Configured schema for excel file contains unsupported data types in columns {}. Supported datatype: \ + Boolean, Int64, Float64, Date32, Date64, !Timestamp [Second, null], !Duration [Second], Null, Utf8", unsupported_data_types)}) + } +} + +fn empty_or_panic(v: &ExcelDataType, field_name: &String) -> Option { + if v.is_empty() { + None + } else { + panic!("Incorrect value {:?} in column {}", v, field_name) + } +} + +fn infer_schema( + r: &Range, + option: &TableOptionExcel, + schema: &Option, +) -> Result { + let TableOptionExcel { + rows_range_start, + rows_range_end, + columns_range_start, + columns_range_end, + schema_inference_lines, + .. 
+ } = *option; + + if let Some(schema) = schema { + infer_schema_from_config(schema) + } else { + let last_row_for_schema_inference = schema_inference_lines + .map(|r| r + rows_range_start.unwrap_or(0)) + .or(rows_range_end); + + let range = ExcelSubrange::new( + r, + rows_range_start, + last_row_for_schema_inference, + columns_range_start, + columns_range_end, + ); + infer_schema_from_data(range) + } +} + +fn excel_range_to_record_batch( + r: Range, + option: &TableOptionExcel, + schema: Schema, +) -> Result { + let TableOptionExcel { + rows_range_start, + rows_range_end, + columns_range_start, + columns_range_end, + .. + } = *option; + + let arrays = schema + .fields() + .iter() + .enumerate() + .map(|(i, field)| { + let rows = ExcelSubrange::new( + &r, + rows_range_start.map(|x| x + 1).or(Some(1)), // skip first row because it is header + rows_range_end, + columns_range_start, + columns_range_end, + ); + let field_name = field.name(); + + match field.data_type() { + DataType::Boolean => Arc::new( + rows.map(|r| { + r.get(i) + .and_then(|v| v.get_bool().or_else(|| empty_or_panic(v, field_name))) + }) + .collect::(), + ) as ArrayRef, + DataType::Int64 => Arc::new( + rows.map(|r| { + r.get(i) + .and_then(|v| v.get_int().or_else(|| empty_or_panic(v, field_name))) + }) + .collect::>(), + ) as ArrayRef, + DataType::Float64 => Arc::new( + rows.map(|r| { + r.get(i) + .and_then(|v| v.get_float().or_else(|| empty_or_panic(v, field_name))) + }) + .collect::>(), + ) as ArrayRef, + DataType::Duration(TimeUnit::Second) => Arc::new( + rows.map(|r| { + r.get(i).and_then(|v| { + v.as_duration() + .map(|v| v.num_seconds()) + .or_else(|| empty_or_panic(v, field_name)) + }) + }) + .collect::(), + ) as ArrayRef, + DataType::Null => Arc::new(NullArray::new(rows.size())) as ArrayRef, + DataType::Utf8 => Arc::new( + rows.map(|r| { + r.get(i).and_then(|v| match v { + ExcelDataType::Bool(x) => Some(x.to_string()), + ExcelDataType::Float(_) + | ExcelDataType::Int(_) + | 
ExcelDataType::String(_) => v.as_string(), + ExcelDataType::DateTime(_) | ExcelDataType::DateTimeIso(_) => { + v.as_datetime().map(|x| x.to_string()) + } + ExcelDataType::Duration(_) | ExcelDataType::DurationIso(_) => { + v.as_duration().map(|x| x.to_string()) + } + ExcelDataType::Empty => None, + ExcelDataType::Error(e) => Some(e.to_string()), + }) + }) + .collect::(), + ) as ArrayRef, + DataType::Timestamp(TimeUnit::Second, None) => Arc::new( + rows.map(|r| { + r.get(i).and_then(|v| { + v.as_datetime() + .map(|v| v.and_utc().timestamp()) + .or_else(|| empty_or_panic(v, field_name)) + }) + }) + .collect::(), + ) as ArrayRef, + DataType::Date64 => Arc::new( + rows.map(|r| { + r.get(i).and_then(|v| { + v.as_datetime() + .map(|v| v.timestamp_millis()) + .or_else(|| empty_or_panic(v, field_name)) + }) + }) + .collect::>(), + ) as ArrayRef, + DataType::Date32 => Arc::new( + rows.map(|r| { + r.get(i).and_then(|v| { + v.as_datetime() + .map(|v| (v.timestamp() / 86400) as i32) + .or_else(|| empty_or_panic(v, field_name)) + }) + }) + .collect::>(), + ) as ArrayRef, + unsupported => panic!("Unsupported data type for excel table {:?}", unsupported), + } + }) + .collect::>(); + + RecordBatch::try_new(Arc::new(schema), arrays).context(CreateRecordBatchSnafu) +} + +pub async fn to_mem_table( + t: &TableSource, +) -> Result { + let opt = t + .option + .as_ref() + .ok_or(table::Error::MissingOption {})? 
+ .as_excel()?; + let uri = t.get_uri_str(); + let mut workbook: Sheets<_> = open_workbook_auto(uri) + .context(OpenWorkbookSnafu) + .context(table::LoadExcelSnafu)?; + + let worksheet_range = match &opt.sheet_name { + Some(sheet) => Some(workbook.worksheet_range(sheet)), + None => workbook.worksheet_range_at(0), + }; + + if let Some(Ok(range)) = worksheet_range { + let shema = infer_schema(&range, opt, &t.schema).context(table::LoadExcelSnafu)?; + let batch = + excel_range_to_record_batch(range, opt, shema).context(table::LoadExcelSnafu)?; + let schema_ref = batch.schema(); + let partitions = vec![vec![batch]]; + + datafusion::datasource::MemTable::try_new(schema_ref, partitions) + .context(table::CreateMemTableSnafu) + } else { + Err(Error::Load { + msg: "Failed to open excel file.".to_owned(), + }) + .context(table::LoadExcelSnafu) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::arrow::array::{BooleanArray, Float64Array, Int64Array, StringArray}; + use crate::table::{TableColumn, TableIoSource}; + use crate::test_util::*; + use datafusion::datasource::TableProvider; + use datafusion::prelude::SessionContext; + + use calamine::{Cell, DataType as ExcelDataType}; + + #[test] + fn excel_subrange_iteration() { + let range = calamine::Range::::from_sparse(vec![ + Cell::new((0, 0), ExcelDataType::Int(0)), + Cell::new((0, 1), ExcelDataType::Bool(true)), + Cell::new((0, 2), ExcelDataType::Float(0.333)), + Cell::new((1, 0), ExcelDataType::Int(1)), + Cell::new((1, 1), ExcelDataType::Bool(false)), + Cell::new((1, 2), ExcelDataType::Float(1.333)), + Cell::new((2, 0), ExcelDataType::Int(2)), + Cell::new((2, 1), ExcelDataType::Empty), + Cell::new((2, 2), ExcelDataType::Float(2.333)), + Cell::new((3, 0), ExcelDataType::Int(3)), + Cell::new((3, 1), ExcelDataType::Bool(true)), + Cell::new((3, 2), ExcelDataType::Float(3.333)), + ]); + let mut subrange = ExcelSubrange::new(&range, None, None, None, None); + assert_eq!(subrange.size(), 4); + assert_eq!( + 
subrange.next(), + Some( + &vec![ + ExcelDataType::Int(0), + ExcelDataType::Bool(true), + ExcelDataType::Float(0.333) + ][..] + ) + ); + assert_eq!( + subrange.next(), + Some( + &vec![ + ExcelDataType::Int(1), + ExcelDataType::Bool(false), + ExcelDataType::Float(1.333) + ][..] + ) + ); + assert_eq!( + subrange.next(), + Some( + &vec![ + ExcelDataType::Int(2), + ExcelDataType::Empty, + ExcelDataType::Float(2.333) + ][..] + ) + ); + assert_eq!( + subrange.next(), + Some( + &vec![ + ExcelDataType::Int(3), + ExcelDataType::Bool(true), + ExcelDataType::Float(3.333) + ][..] + ) + ); + assert_eq!(subrange.next(), None); + + let mut subrange = ExcelSubrange::new(&range, Some(1), Some(2), Some(1), Some(1)); + assert_eq!(subrange.size(), 2); + assert_eq!(subrange.next(), Some(&vec![ExcelDataType::Bool(false)][..])); + assert_eq!(subrange.next(), Some(&vec![ExcelDataType::Empty][..])); + assert_eq!(subrange.next(), None); + } + + #[test] + fn inferes_schema_from_data() { + let range = calamine::Range::::from_sparse(vec![ + Cell::new((0, 0), ExcelDataType::String(String::from("int_column"))), + Cell::new((0, 1), ExcelDataType::String(String::from("bool_column"))), + Cell::new((0, 2), ExcelDataType::String(String::from("float column"))), + Cell::new((0, 3), ExcelDataType::String(String::from("string_column"))), + Cell::new( + (0, 4), + ExcelDataType::String(String::from("datetime_column")), + ), + Cell::new( + (0, 5), + ExcelDataType::String(String::from("datetime iso column")), + ), + Cell::new( + (0, 6), + ExcelDataType::String(String::from("duration column")), + ), + Cell::new( + (0, 7), + ExcelDataType::String(String::from("duration iso column")), + ), + Cell::new((1, 0), ExcelDataType::Int(0)), + Cell::new((1, 1), ExcelDataType::Bool(true)), + Cell::new((1, 2), ExcelDataType::Float(0.333)), + Cell::new((1, 3), ExcelDataType::String(String::from("test"))), + Cell::new((1, 4), ExcelDataType::DateTime(44986.12)), + Cell::new((1, 5), 
ExcelDataType::DateTimeIso(String::from("test"))), + Cell::new((1, 6), ExcelDataType::Duration(44986.12)), + Cell::new((1, 7), ExcelDataType::DurationIso(String::from("test"))), + Cell::new((2, 0), ExcelDataType::Empty), + Cell::new((2, 0), ExcelDataType::Empty), + Cell::new((2, 1), ExcelDataType::Empty), + Cell::new((2, 2), ExcelDataType::Empty), + Cell::new((2, 3), ExcelDataType::Empty), + Cell::new((2, 4), ExcelDataType::Empty), + Cell::new((2, 5), ExcelDataType::Empty), + Cell::new((2, 6), ExcelDataType::Empty), + Cell::new((2, 7), ExcelDataType::Empty), + ]); + + let schema = infer_schema(&range, &TableOptionExcel::default(), &None).unwrap(); + + assert_eq!( + schema, + Schema::new(vec![ + Field::new("int_column", DataType::Int64, true), + Field::new("bool_column", DataType::Boolean, true), + Field::new("float_column", DataType::Float64, true), + Field::new("string_column", DataType::Utf8, true), + Field::new( + "datetime_column", + DataType::Timestamp(TimeUnit::Second, None), + true + ), + Field::new( + "datetime_iso_column", + DataType::Timestamp(TimeUnit::Second, None), + true + ), + Field::new( + "duration_column", + DataType::Duration(TimeUnit::Second), + true + ), + Field::new( + "duration_iso_column", + DataType::Duration(TimeUnit::Second), + true + ), + ]) + ); + + let range = calamine::Range::::from_sparse(vec![ + Cell::new((0, 0), ExcelDataType::String(String::from("test_column"))), + Cell::new((1, 0), ExcelDataType::Int(0)), + Cell::new((2, 0), ExcelDataType::Empty), + Cell::new((2, 0), ExcelDataType::Float(0.5)), + ]); + + let schema = infer_schema(&range, &TableOptionExcel::default(), &None).unwrap(); + + assert_eq!( + schema, + Schema::new(vec![Field::new("test_column", DataType::Utf8, true)]) + ); + + let range = calamine::Range::::from_sparse(vec![ + Cell::new((0, 0), ExcelDataType::String(String::from("int_column"))), + Cell::new((0, 1), ExcelDataType::Empty), + Cell::new((0, 2), ExcelDataType::String(String::from("float column"))), + ]); + + 
assert!(infer_schema(&range, &TableOptionExcel::default(), &None).is_err()); + + let range = calamine::Range::::from_sparse(vec![ + Cell::new((0, 0), ExcelDataType::String(String::from("column1"))), + Cell::new((0, 1), ExcelDataType::String(String::from("column2"))), + Cell::new((1, 0), ExcelDataType::Int(1)), + Cell::new((1, 1), ExcelDataType::Int(1)), + Cell::new((1, 3), ExcelDataType::Int(1)), + ]); + assert!(infer_schema(&range, &TableOptionExcel::default(), &None).is_err()); + } + + #[test] + fn inferes_schema_from_config() { + let range = calamine::Range::::from_sparse(vec![]); + let table_schema = TableSchema { + columns: vec![ + TableColumn { + name: String::from("float_column"), + data_type: DataType::Float64, + nullable: true, + }, + TableColumn { + name: String::from("integer_column"), + data_type: DataType::Int64, + nullable: true, + }, + ], + }; + let schema = + infer_schema(&range, &TableOptionExcel::default(), &Some(table_schema)).unwrap(); + + assert_eq!( + schema.all_fields(), + vec![ + &Field::new("float_column", DataType::Float64, true), + &Field::new("integer_column", DataType::Int64, true), + ] + ); + + let table_schema = TableSchema { + columns: vec![ + TableColumn { + name: String::from("float_column"), + data_type: DataType::Float16, + nullable: true, + }, + TableColumn { + name: String::from("integer_column"), + data_type: DataType::Int16, + nullable: true, + }, + ], + }; + assert!(infer_schema(&range, &TableOptionExcel::default(), &Some(table_schema)).is_err()); + } + + #[tokio::test] + async fn load_xlsx_with_toml_config() { + let mut table_source: TableSource = toml::from_str( + r#" +name = "test" +uri = "test_data/uk_cities_with_headers.xlsx" +[option] +format = "xlsx" +sheet_name = "uk_cities_with_headers" +"#, + ) + .unwrap(); + // patch uri path with the correct test data path + table_source.io_source = TableIoSource::Uri(test_data_path("uk_cities_with_headers.xlsx")); + + let t = to_mem_table(&table_source).await.unwrap(); + let ctx 
= SessionContext::new(); + let stats = t + .scan(&ctx.state(), None, &[], None) + .await + .unwrap() + .statistics(); + assert_eq!(stats.num_rows, Some(37)); + } + + #[tokio::test] + async fn load_xlsx_with_yaml_config() { + let mut table_source: TableSource = serde_yaml::from_str( + r#" +name: "test" +uri: "test_data/uk_cities_with_headers.xlsx" +option: + format: "xlsx" + sheet_name: "uk_cities_with_headers" +"#, + ) + .unwrap(); + // patch uri path with the correct test data path + table_source.io_source = TableIoSource::Uri(test_data_path("uk_cities_with_headers.xlsx")); + + let t = to_mem_table(&table_source).await.unwrap(); + let ctx = SessionContext::new(); + let stats = t + .scan(&ctx.state(), None, &[], None) + .await + .unwrap() + .statistics(); + assert_eq!(stats.num_rows, Some(37)); + } + + #[tokio::test] + async fn load_ods_with_custom_range_and_without_sheet_name() { + let mut table_source: TableSource = serde_yaml::from_str( + r#" +name: "test" +uri: "test_data/excel_range.ods" +option: + format: "ods" + rows_range_start: 2 + rows_range_end: 5 + columns_range_start: 1 + columns_range_end: 6 + schema_inference_lines: 3 +"#, + ) + .unwrap(); + // patch uri path with the correct test data path + table_source.io_source = TableIoSource::Uri(test_data_path("excel_range.ods")); + + let t = to_mem_table(&table_source).await.unwrap(); + let ctx = SessionContext::new(); + let stats = t + .scan(&ctx.state(), None, &[], None) + .await + .unwrap() + .statistics(); + assert_eq!(stats.column_statistics.unwrap().len(), 6); + assert_eq!(stats.num_rows, Some(3)); + } + + #[test] + fn transforms_excel_range_to_record_batch() { + let range: calamine::Range = + calamine::Range::::from_sparse(vec![ + Cell::new((0, 0), ExcelDataType::String("float_column".to_string())), + Cell::new((1, 0), ExcelDataType::Float(1.333)), + Cell::new((2, 0), ExcelDataType::Empty), + Cell::new((3, 0), ExcelDataType::Float(3.333)), + Cell::new((0, 1), 
ExcelDataType::String("integer_column".to_string())), + Cell::new((1, 1), ExcelDataType::Int(1)), + Cell::new((2, 1), ExcelDataType::Int(3)), + Cell::new((3, 1), ExcelDataType::Empty), + Cell::new((0, 2), ExcelDataType::String("boolean_column".to_string())), + Cell::new((1, 2), ExcelDataType::Empty), + Cell::new((2, 2), ExcelDataType::Bool(true)), + Cell::new((3, 2), ExcelDataType::Bool(false)), + Cell::new((0, 3), ExcelDataType::String("string_column".to_string())), + Cell::new((1, 3), ExcelDataType::String("foo".to_string())), + Cell::new((2, 3), ExcelDataType::String("bar".to_string())), + Cell::new((3, 3), ExcelDataType::String("baz".to_string())), + Cell::new((0, 4), ExcelDataType::String("mixed_column".to_string())), + Cell::new((1, 4), ExcelDataType::Float(1.1)), + Cell::new((2, 4), ExcelDataType::Int(1)), + Cell::new((3, 4), ExcelDataType::Empty), + Cell::new((0, 5), ExcelDataType::String("datetime_column".to_string())), + Cell::new((1, 5), ExcelDataType::DateTime(44986.12)), // 2023-03-01T02:52:48 + Cell::new((2, 5), ExcelDataType::Empty), + Cell::new((3, 5), ExcelDataType::DateTime(44900.12)), // 2022-12-05T02:52:48 + ]); + + let shema = infer_schema(&range, &TableOptionExcel::default(), &None).unwrap(); + let rb = excel_range_to_record_batch(range, &TableOptionExcel::default(), shema).unwrap(); + + assert_eq!( + rb.schema().all_fields(), + vec![ + &Field::new("float_column", DataType::Float64, true), + &Field::new("integer_column", DataType::Int64, true), + &Field::new("boolean_column", DataType::Boolean, true), + &Field::new("string_column", DataType::Utf8, true), + &Field::new("mixed_column", DataType::Utf8, true), + &Field::new( + "datetime_column", + DataType::Timestamp(TimeUnit::Second, None), + true + ), + ] + ); + + assert_eq!( + rb.column(0).as_ref(), + Arc::new(Float64Array::from(vec![Some(1.333), None, Some(3.333)])).as_ref(), + ); + assert_eq!( + rb.column(1).as_ref(), + Arc::new(Int64Array::from(vec![Some(1), Some(3), None])).as_ref(), + ); + 
assert_eq!( + rb.column(2).as_ref(), + Arc::new(BooleanArray::from(vec![None, Some(true), Some(false)])).as_ref(), + ); + assert_eq!( + rb.column(3).as_ref(), + Arc::new(StringArray::from(vec!["foo", "bar", "baz"])).as_ref(), + ); + assert_eq!( + rb.column(4).as_ref(), + Arc::new(StringArray::from(vec![Some("1.1"), Some("1"), None])).as_ref(), + ); + assert_eq!( + rb.column(5).as_ref(), + Arc::new(TimestampSecondArray::from(vec![ + Some(1677639168), // Unix timestamp for 2023-03-01T02:52:48 UTC + None, + Some(1670208768) // Unix timestamp for 2022-12-05T02:52:48 UTC + ])) + .as_ref(), + ); + } +} diff --git a/columnq/src/table/mod.rs b/columnq/src/table/mod.rs index efd1bf3..e187f47 100644 --- a/columnq/src/table/mod.rs +++ b/columnq/src/table/mod.rs @@ -21,11 +21,11 @@ pub mod arrow_ipc_stream; pub mod csv; pub mod database; pub mod delta; +pub mod excel; pub mod google_spreadsheets; pub mod json; pub mod ndjson; pub mod parquet; -pub mod xlsx; #[derive(Debug, Snafu)] pub enum Error { @@ -45,8 +45,8 @@ pub enum Error { LoadArrowIpcFile { source: arrow_ipc_file::Error }, #[snafu(display("Failed to load Google Sheet data: {source}"))] LoadGoogleSheet { source: google_spreadsheets::Error }, - #[snafu(display("Failed to load XLSX data: {source}"))] - LoadXlsx { source: xlsx::Error }, + #[snafu(display("Failed to load Excel data: {source}"))] + LoadExcel { source: excel::Error }, #[snafu(display("Failed to load database data: {source}"))] LoadDatabase { source: database::Error }, #[snafu(display("Failed to cast IO source to memory bytes for source: {table_source}"))] @@ -251,9 +251,14 @@ impl Default for TableOptionParquet { } } -#[derive(Deserialize, Debug, Clone, Eq, PartialEq)] -pub struct TableOptionXlsx { +#[derive(Deserialize, Default, Debug, Clone, Eq, PartialEq)] +pub struct TableOptionExcel { pub sheet_name: Option, + pub rows_range_start: Option, + pub rows_range_end: Option, + pub columns_range_start: Option, + pub columns_range_end: Option, + pub 
schema_inference_lines: Option, } #[derive(Deserialize, Debug, Clone, Eq, PartialEq)] @@ -298,7 +303,10 @@ pub enum TableLoadOption { jsonl {}, parquet(TableOptionParquet), google_spreadsheet(TableOptionGoogleSpreadsheet), - xlsx(TableOptionXlsx), + xls(TableOptionExcel), + xlsx(TableOptionExcel), + xlsb(TableOptionExcel), + ods(TableOptionExcel), delta(TableOptionDelta), arrow {}, arrows {}, @@ -317,10 +325,10 @@ impl TableLoadOption { } } - pub fn as_xlsx(&self) -> Result<&TableOptionXlsx, Error> { + pub fn as_excel(&self) -> Result<&TableOptionExcel, Error> { match self { - Self::xlsx(opt) => Ok(opt), - _ => Err(Error::ExpectFormatOption { fmt: "xlsx" }), + Self::xls(opt) | Self::xlsx(opt) | Self::xlsb(opt) | Self::ods(opt) => Ok(opt), + _ => Err(Error::ExpectFormatOption { fmt: "excel" }), } } @@ -353,7 +361,10 @@ impl TableLoadOption { Self::csv { .. } => "csv", Self::parquet { .. } => "parquet", Self::google_spreadsheet(_) | Self::delta { .. } => "", + Self::xls { .. } => "xls", Self::xlsx { .. } => "xlsx", + Self::ods { .. } => "ods", + Self::xlsb { .. } => "xlsb", Self::arrow { .. } => "arrow", Self::arrows { .. } => "arrows", Self::mysql { .. } => "mysql", @@ -537,7 +548,7 @@ impl TableSource { match Path::new(uri).extension().and_then(OsStr::to_str) { Some(ext) => match ext { "csv" | "json" | "ndjson" | "jsonl" | "parquet" | "arrow" | "arrows" - | "xlsx" => ext, + | "xls" | "xlsx" | "xlsb" | "ods" => ext, "sqlite" | "sqlite3" | "db" => "sqlite", _ => { return Err(Error::Extension { @@ -628,7 +639,10 @@ pub async fn load( TableLoadOption::google_spreadsheet(_) => { Arc::new(google_spreadsheets::to_mem_table(t).await?) } - TableLoadOption::xlsx { .. } => Arc::new(xlsx::to_mem_table(t).await?), + TableLoadOption::xlsx { .. } + | TableLoadOption::xls { .. } + | TableLoadOption::xlsb { .. } + | TableLoadOption::ods { .. } => Arc::new(excel::to_mem_table(t).await?), TableLoadOption::delta { .. 
} => delta::to_datafusion_table(t, dfctx).await?, TableLoadOption::arrow { .. } => { Arc::new(arrow_ipc_file::to_mem_table(t, dfctx).await?) @@ -652,6 +666,7 @@ pub async fn load( "json" => Arc::new(json::to_mem_table(t, dfctx).await?), "ndjson" | "jsonl" => Arc::new(ndjson::to_mem_table(t, dfctx).await?), "parquet" => parquet::to_datafusion_table(t, dfctx).await?, + "xls" | "xlsx" | "xlsb" | "ods" => Arc::new(excel::to_mem_table(t).await?), "arrow" => Arc::new(arrow_ipc_file::to_mem_table(t, dfctx).await?), "arrows" => Arc::new(arrow_ipc_stream::to_mem_table(t, dfctx).await?), "mysql" => Arc::new(database::DatabaseLoader::MySQL.to_mem_table(t)?), diff --git a/columnq/src/table/xlsx.rs b/columnq/src/table/xlsx.rs deleted file mode 100644 index 3efc330..0000000 --- a/columnq/src/table/xlsx.rs +++ /dev/null @@ -1,272 +0,0 @@ -use crate::table::{self, TableSource}; -use calamine::{open_workbook, Range, Reader, Xlsx}; -use datafusion::arrow::array::{ArrayRef, BooleanArray, PrimitiveArray, StringArray}; -use datafusion::arrow::datatypes::{DataType, Field, Float64Type, Int64Type, Schema}; -use datafusion::arrow::record_batch::RecordBatch; -use snafu::prelude::*; -use std::collections::{HashMap, HashSet}; -use std::sync::Arc; -use std::vec; - -#[derive(Debug, Snafu)] -pub enum Error { - #[snafu(display("Failed to load XLSX: {msg}"))] - Load { msg: String }, - #[snafu(display("Failed to create record batch: {source}"))] - CreateRecordBatch { - source: datafusion::arrow::error::ArrowError, - }, - #[snafu(display("Failed to open workbook: {source}"))] - OpenWorkbook { source: calamine::XlsxError }, -} - -fn infer_value_type(v: &calamine::DataType) -> Result { - match v { - calamine::DataType::Int(_) if v.get_int().is_some() => Ok(DataType::Int64), - calamine::DataType::Float(_) if v.get_float().is_some() => Ok(DataType::Float64), - calamine::DataType::Bool(_) if v.get_bool().is_some() => Ok(DataType::Boolean), - calamine::DataType::String(_) if v.get_string().is_some() => 
Ok(DataType::Utf8), - calamine::DataType::Error(e) => Err(Error::Load { msg: e.to_string() }), - // TODO(upstream): support `Date64` - calamine::DataType::DateTime(_) => Err(Error::Load { - msg: "Unsupported data type: DateTime".to_owned(), - }), - calamine::DataType::Empty => Ok(DataType::Null), - _ => Err(Error::Load { - msg: "Failed to parse the cell value".to_owned(), - }), - } -} - -fn infer_schema(r: &Range) -> Result { - let mut col_types: HashMap<&str, HashSet> = HashMap::new(); - let mut rows = r.rows(); - let col_names: Result, _> = rows - .next() - .unwrap() - .iter() - .enumerate() - .map(|(i, c)| { - c.get_string().ok_or_else(|| Error::Load { - msg: format!("The {i}th column name is empty"), - }) - }) - .collect(); - - let col_names = match col_names { - Ok(values) => values, - Err(e) => return Err(e), - }; - - for row in rows { - for (i, col_val) in row.iter().enumerate() { - let col_name = col_names.get(i).unwrap(); - let col_type = infer_value_type(col_val).unwrap(); - let entry = col_types.entry(col_name).or_default(); - entry.insert(col_type); - } - } - - let fields: Vec = col_names - .iter() - .map(|col_name| { - let set = col_types.entry(col_name).or_insert_with(|| { - let mut set = HashSet::new(); - set.insert(DataType::Utf8); - set - }); - - let mut dt_iter = set.iter().cloned(); - let dt = dt_iter.next().unwrap_or(DataType::Utf8); - Field::new(col_name.replace(' ', "_"), dt, true) - }) - .collect(); - Ok(Schema::new(fields)) -} - -fn xlsx_sheet_value_to_record_batch(r: Range) -> Result { - let schema = infer_schema(&r)?; - let arrays = schema - .fields() - .iter() - .enumerate() - .map(|(i, field)| { - let rows = r.rows().skip(1); - match field.data_type() { - DataType::Boolean => Arc::new( - rows.map(|r| r.get(i).map(|v| v.get_bool().unwrap())) - .collect::(), - ) as ArrayRef, - DataType::Int64 => Arc::new( - rows.map(|r| r.get(i).map(|v| v.get_int().unwrap())) - .collect::>(), - ) as ArrayRef, - DataType::Float64 => Arc::new( - rows.map(|r| 
r.get(i).map(|v| v.get_float().unwrap())) - .collect::>(), - ) as ArrayRef, - _ => Arc::new( - rows.map(|r| r.get(i).map(|v| v.get_string().unwrap_or("null"))) - .collect::(), - ) as ArrayRef, - } - }) - .collect::>(); - - RecordBatch::try_new(Arc::new(schema), arrays).context(CreateRecordBatchSnafu) -} - -pub async fn to_mem_table( - t: &TableSource, -) -> Result { - let opt = t - .option - .as_ref() - .ok_or(table::Error::MissingOption {})? - .as_xlsx()?; - let uri = t.get_uri_str(); - let mut workbook: Xlsx<_> = open_workbook(uri) - .context(OpenWorkbookSnafu) - .context(table::LoadXlsxSnafu)?; - match &opt.sheet_name { - Some(sheet) => { - if let Some(Ok(r)) = workbook.worksheet_range(sheet) { - let batch = xlsx_sheet_value_to_record_batch(r).context(table::LoadXlsxSnafu)?; - let schema_ref = batch.schema(); - let partitions = vec![vec![batch]]; - Ok( - datafusion::datasource::MemTable::try_new(schema_ref, partitions) - .context(table::CreateMemTableSnafu)?, - ) - } else { - Err(Error::Load { - msg: "Failed to open .xlsx file.".to_owned(), - }) - .context(table::LoadXlsxSnafu) - } - } - None => Err(Error::Load { - msg: "`sheet_name` is not specified".to_owned(), - }) - .context(table::LoadXlsxSnafu), - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::arrow::array::{BooleanArray, Float64Array, Int64Array, StringArray}; - use crate::table::TableIoSource; - use crate::test_util::*; - use datafusion::datasource::TableProvider; - use datafusion::prelude::SessionContext; - - use calamine::{Cell, DataType as XlsxDataType, Range}; - - fn property_sheet() -> Range { - let cells: Vec> = vec![ - Cell::new((0, 0), XlsxDataType::String("float_column".to_string())), - Cell::new((1, 0), XlsxDataType::Float(1.333)), - Cell::new((2, 0), XlsxDataType::Float(3.333)), - Cell::new((0, 1), XlsxDataType::String("integer_column".to_string())), - Cell::new((1, 1), XlsxDataType::Int(1)), - Cell::new((2, 1), XlsxDataType::Int(3)), - Cell::new((0, 2), 
XlsxDataType::String("boolean_column".to_string())), - Cell::new((1, 2), XlsxDataType::Bool(true)), - Cell::new((2, 2), XlsxDataType::Bool(false)), - Cell::new((0, 3), XlsxDataType::String("string_column".to_string())), - Cell::new((1, 3), XlsxDataType::String("foo".to_string())), - Cell::new((2, 3), XlsxDataType::String("bar".to_string())), - ]; - calamine::Range::::from_sparse(cells) - } - - #[tokio::test] - async fn load_xlsx_with_toml_config() { - let mut table_source: TableSource = toml::from_str( - r#" -name = "test" -uri = "test_data/uk_cities_with_headers.xlsx" -[option] -format = "xlsx" -sheet_name = "uk_cities_with_headers" -"#, - ) - .unwrap(); - // patch uri path with the correct test data path - table_source.io_source = TableIoSource::Uri(test_data_path("uk_cities_with_headers.xlsx")); - - let t = to_mem_table(&table_source).await.unwrap(); - let ctx = SessionContext::new(); - let stats = t - .scan(&ctx.state(), None, &[], None) - .await - .unwrap() - .statistics(); - assert_eq!(stats.num_rows, Some(37)); - } - - #[tokio::test] - async fn load_xlsx_with_yaml_config() { - let mut table_source: TableSource = serde_yaml::from_str( - r#" -name: "test" -uri: "test_data/uk_cities_with_headers.xlsx" -option: - format: "xlsx" - sheet_name: "uk_cities_with_headers" -"#, - ) - .unwrap(); - // patch uri path with the correct test data path - table_source.io_source = TableIoSource::Uri(test_data_path("uk_cities_with_headers.xlsx")); - - let t = to_mem_table(&table_source).await.unwrap(); - let ctx = SessionContext::new(); - let stats = t - .scan(&ctx.state(), None, &[], None) - .await - .unwrap() - .statistics(); - assert_eq!(stats.num_rows, Some(37)); - } - - #[test] - fn schema_interface() { - let sheet = property_sheet(); - let schema = infer_schema(&sheet).unwrap(); - assert_eq!( - schema, - Schema::new(vec![ - Field::new("float_column", DataType::Float64, true), - Field::new("integer_column", DataType::Int64, true), - Field::new("boolean_column", 
DataType::Boolean, true), - Field::new("string_column", DataType::Utf8, true), - ]) - ); - } - - #[test] - fn xlsx_value_to_record_batch() { - let sheet = property_sheet(); - let rb = xlsx_sheet_value_to_record_batch(sheet).unwrap(); - - assert_eq!(rb.num_columns(), 4); - assert_eq!( - rb.column(0).as_ref(), - Arc::new(Float64Array::from(vec![1.333, 3.333])).as_ref(), - ); - assert_eq!( - rb.column(1).as_ref(), - Arc::new(Int64Array::from(vec![1, 3])).as_ref(), - ); - assert_eq!( - rb.column(2).as_ref(), - Arc::new(BooleanArray::from(vec![true, false])).as_ref(), - ); - assert_eq!( - rb.column(3).as_ref(), - Arc::new(StringArray::from(vec!["foo", "bar"])).as_ref(), - ); - } -} diff --git a/test_data/excel_range.ods b/test_data/excel_range.ods new file mode 100644 index 0000000000000000000000000000000000000000..7f596e276dd9192fa7a757b388dc2164a7c81ed4 GIT binary patch literal 10927 zcmdsdbwC_Tv+okzAy{w?8r(fN1X(n=yDhdr&_HkqF2OyxYjB6)?(QzZgI@T)dv4A- zH|Klr{rhUScY0>0s=I5de_c~u^3sq{m;eAA0HE}VOwG@VJD33g06dqcO8{$gYh$pB zt+BqXt(CcQazzsM9u zF3gcd005rLt0$StW{%bdAboQy2S)H8RR&v-X^^~}1Tq5N(_H{$DM?Yqr{nljd4z*} z`qiq%$ay+w$jc~;k&~0t(b2K8vhwosii(QL$jB%uDQRkI8X6i}T3R|dIJmjF`S|#R zgoH##M<*pEWo2a*78X`iR5Ua+w70hp3=E8qkI&D~udlD~@9&?Tojp7}JU8_CsA!Da z2>`&CNQnw5f0#Q=LUdTCM(U}ZJt<>m^l_0~esoJM8wZUa>bN^tZ@Xp<9CVipZ3Up+ z_$(4%Ki=Ny<=Av8vyydwiAG0ryBU&!idX>5qoPxdgrdwJZaW(#h8FbZ)a_D-LKOiO z@U1y9a%kL@C^zXSi^~fg+gwCq@@Rt%7PMB9ScDX5;75ac31!lGzq72@#M|uW(!ADW zV-6zu&T2pDaowwNrJaz@<{O5k0lwASuWq{z`s(?3SLHcOwIw};MFn<^g&s4*3RHyi>iyo(@zhZr$vZYX%8O#98L8# zYogIR@0U1~n`p-!QXZ{9JJnw}2sw4;)8eDbBea}XI9t$EJrH=}i@QwX8hRTUb0Ui6 z&%_f#$~wNnnYLzd{#deLBC;-l&_?o~nTt+q>~)zRI9+IPWJab4nIQO3Onjfdl$}cbOZtADd0GTHsY-%A_9L<>(XOqiVIL|HkHD?B5-`|K% z<*8r@IO2$^*bYOzcweRvNx)p9YiAZs!?BxH#Mqo@-%mpC=+Xd|PN}PRJshWuaxpA1 z&Hi$XsMLU@(ErC~!VI%>%f_dUp*^UEeMW^B$N|Wdsr8%KA`LlgB_8 zmr-5{i?OSu^q&9XDxU8;qUd5H7ami;U*LSn#K3aTaa1{}b2VlWI1yN&Z(3kpdKE0N zrffCVX~=irxrYgCmq6Xv#&^!WVF7l6NpoLDf0e69;44#g*`zu(omQz?1V#;Mq2|zg zX)haU(4@W-p1>NnJB9UR0zY 
zXe%FU75l=8u(TaUk6E%p!>1iRb+oRBsNC;pc{H7csTWz_?2kQ31jNiiY3sob^?{xH zo;Ccge`(jT>8YR0!%EXhr7j#;r1(N|M&*BB11O0)77 zhr#n6zTBP5v6{`O{sLb=P%Y~Wzu(l(2s%QI{`|3DbYE-C4GZynTQ-)5qkShJ9on{I zu1Bm-be$!(TgR5QWy?#$yd-D5I@)_G55{8>f)U_q6Vr$mf+A%~nde2dOrr`HhNuIo%)D^uQ2 zQvV$>2)H`2!8T?*TAgMvrLz>D{l%lEyVMBENcm{5RMwk%{dPMQwJ@_jI!OfX#o1c* zSIhgTT_TR5FngNW>$=;zanfoDsw60mrh)#YOxg~pF}DN?twxPq|A1HV#dCKDIUGL& zEpB&9og~wUxfJ}R4a~^*I4hlt0Z@Gw0)bX`79wwuur_SvqK?>Y@}^UQI*}sc0L=07 z{t8yJf#yke412E_>0%=%mfl9uY6`MlXh9iwSV?()CprXjoesTAh^dlJk(5QsktqlQ zYv9xQlINiL7`nU?Bvv#2b-Hy&&&-XkFThuPVw8J!r<^h)<97zzIkLFD6h# z6(m$rh>7}5w29W>GdtGR=R2Wyv`vUHPw?;Ax@PMO$trgM_e0HV>saBXX`O8dQ6^P; zH#m?5Rhq0|F#ti6bubp3xbNah2xY+A0`Yr#c=yyl^v1-kdpYWkJBS7YH6neph%k^MSr75S2v@YeXN|~4yBF0bs z>tr6w&aD;OIGp%pAPt(q*>HTf+=R~ji)t>@-8fdM^Xqih?^b$Jy2BL1*Sj*2Nm_U^ z6bbKnBR>bXC->@a#}~-m6O6x{`dMDL-$B(SUxLNTQOl;NYTcI|DXL< zMz-rt6Sbi1F?-cM&h739I+|c8JTTA)sNX(ph|oVRQGczK^HB9hmjF0_YSd<+%;Fi9 zW?`v&=Snu=vECHUI0PK%eA2;%jCf(Ru$~j?Acw6xHt8b`=w>it9Z(YAURuO+;4AeP%7%p}u(+ ziP^ZGQR^3Mi~Z08r-a+5<>IqMj&q!bcsJktA8|1D;Y^B4s2QrK!37JU`_^S z=*fozNQudbmI&*8`nL-T2M6~rxAys;KM3im;$RF0n}bXp7(Q5ARjbb1&oQBS9IG<9 z0ta5pwn7@gnd50B8Zk*R+Eg^@$r=?)L@}T5(;0rx)*D9iNo%c7uH@la(snNWLRw1y zmexO00&m+<5pp$SxU*3rHP_kH+J5nIqHFfEi{9-{7b%=a&Rfe$(a8a)j03BB2?H9d z>|h$r5ww_MetdGYdF#vD$;-qy`0DnQI&KsrJQehGQc-8wj$!s7cAk{5>eNMs3;BVW zJNW{u3pZL(ck*|fMr}pCi5EA1vHa0`%0lS0?p%8)*a6LPX%K zMKks={}xydRC^Jd>iHD+>*Tg`0hE?DE0Q>6d_ngO+LYK4@UEHp6i}}QqXt1fv9+`> zh()m$zk?SdcIDWsA&?-CEUYUIotx z8VdIW3iFf;YeyB7Xr>b@BiwM(0z~`>jKu)#&^RJa3U&*tU;DgPqO|kE zklfk`2BUCNCN;mejlz5^G{O>l@WLyAOJrDiU3e94@Ld7RB#8Vjt$;-|tl=12f*F|N zGHbyhs7qw{5IGmqhmfgDWf6h8rC%FXypLawV)xUqNLCtlh!z}42kRvLGsx-`mZDjn zNOd)~++3=%;1u$npS~h-iDwHtW#(e`Q+_+9I*&K*RaZv5p1n;$efjNf*DSA)CpN<@ zQzu?PjGG<1vHQBMVc&<^L#O;t{UP|2d0K=9=R%l2CM#dKNVD^MWQ&adP#Esy>12vZ zSIo$T;amO;4P9DBUvnwoSdk5^4ZzfJk4%am)PIULzms4vL0^`Z(5<^fx4=Lu*O9X{dtUg}ctRx{Xu{ zzK~T&Qu5i<>m($&I4xAdyoS~SFdJLcWu z5?@*CFsH`SiWuvic9qA-*s^>=#czj&Z;btvWG@2GRhG)3L`6UF50du4Qaiy92`tJ? 
zCvQv7FM(>OrfT6rt*>rw=H554ucXKIBPLSkGnFZP+Rz(tqMt4?QtH&#byuA7jltSQ z7+osyCJ<1cOeUig?F1X^U9uYOv8`ERT!od$F05|pA)6m_DYXyE|LAzgSPRGjA^fa2 z^e8<*%~ZO-wyh1POS~kM?&HwHl25)$&57fBBG#4JK_Jewhk9+F)Qc9u>1$uaDv6=~ySN8FK^+-T13-ieD$0H(S zGBZkLlWCro?@+>!b+Em#%VB(;;LqU6QwuB0RLaE(Qa8HMcz^xEJg@cnPvW94eUQcH z8Sxif`G`uaUmKn7DnL({f0apzGN`-S0Is`!43>=aON6On9bu5b>}QClt~T!Ygxd!1 z;@@izWPQUHNR*Y`_u%@j*aQxO2BVGSQ|gAaNJE{?2zYup843JSJOCRtN zB!K?bYk{MyTRzMv265@UrU01@hp7P+C>jbNdqsklNupnS%-%CSh;jeMZds-!_f@bO z#UDh}ENy%i=-8&k{EFMX$a?y*!E9)HXSzcUXeJiR7on^+5>< z&AfhF(9D%BASQ*!bz->N>c!fWPwWa-exNlU&6t&jd15FDPA|!}?+gW5KbF`Dh8k%k zPJ22*eT7=EjU9FE%Fp)t)NQR*&J}`=trVpB*v(;o{WGBdATErR6RjTN^67&9%vNr@ z)1lhKTdNQyeK*d^JN=QR6e}ryOTA#7lmW}IO$L9xEbR&L-b+Bd8MDGJ!|XS4E}f9U z#T3osjg=k<93+S+hvass__<_kGq@v^IbQA~I7%ykEy_Hh4tvg8axnFG{m))g*V=a% zI+q^F62!;TaCTRE{@n-X(87W0sjWR@`CpA)M%vVo6kj3~!b7{WllUu4V+TfM@AZ<# za&qoLD0fLgh4>P8f0K8ur>ua2VrsL4c}0`gcr@FZ{nkrx>*7skK2+O{Q=WZ`Rz1C6 zod&5orwP<|)219>|I-2za&>FUPE2SGw$&;Qx$|D4);DoqIEKbi;7iSei)n@gAmrZq zUU`9+#pUFa0kln6tC>6Ta^QQNs|O{fu&f@ImA$!l*BIRf*p}m}1yNJY76rZeIT77l zknq7!FY8{{lr$lg+*9`{M}wg9(N@bC2`@}7N>FJ&knL6ZxD+tVqFg6Nk?HJ{nDOl8 zM`+FDe5RekyQufd!oz(~Wvd4zH1;jj$n5r| zLP&+7bdKVfV8K)>LA%e+XBHh2p+{i6cNT`9J4B_0&6>U^T%W#QghLU7$^NJs>$0`` zvG-ECMz_Mlw#GSS{>IoX*73A?=-_eiVhe}%n|zxef&e+~PCbnaGuDUyzAzMqa4>ZH zqrySX5tZz3e4p2SbW&rX;h9Ct2uD#;gwCP8Ld)i&envKJd;nGk!dPbIEkoC zRQk};%~-FO{1QRU=Yl2O7*3JIFYL25=uRJ#mf$$s1>9I-tTe@ARhMA_eoY)7PLC>GnYE{hIF-7)a)^OcllTuc%~R^- z1SSc|6KikZ0@z)#_{_OljMHodRxJmhNO$9c6E447-n`Dtc}x0%C0A~FD6L2t^>sOk zDvZcfs%QB=uY^_=b3uKA>(Uxue}m~tI&;pW!urQBNFDs4;E0XSD=VMV&=aXiPm)h| zwpm|L$R!4@W2Rta3*#i4LiI|z99K)({4mMRjmnxOcOHEoZZE;dhSpzp~a$uPDU6srn@#vMitZOmqjEn;e$%+oaf=|Q2ZQ!IVq@uNC(-N_fwa1hbRcsvf9#j=e zX;Dsl5KAcyu2(#y53Zl}$^>1)h%E+Au)c(Fvn~yjS+6YoxweOEqpsDH)C4tU8D>_a z?6fJF@$+c;I@dFlkz6_SW*=d4eD#sh?w0u3-%#}@m*9Mg5vL3PCx7EPn(F{|u`>Rb zuYIUxZ8OjH^1-vE-PSGyn@hbS7H2-`+e|~#B5yi!usNQ z;O}vDwo4~zl!rr>h~3S0jKaadjkOtA42l`lzRJ>`5sePxB 
z%`&;F_>C`dovMUK?fd)gOxO!v>D5!Sq>Ru#JyOK@xBy;d#}lOubCk(9djWybiC91) z_=iTqQf|~Q_06+Zl=u$9357tM5I+Vt2cKdshR+F9U#+Bjh_E?B&UcLpOe+MoBqs$( zH92Sk?_mdGGK2`&Gx1mmyy3pleaO$ro=j(B^0p z?UOej1s-cX!lj(1g%cy_Fza6E2-m!b?UKwpFd@?>0@4i1VnIgrG7P5RcKfAjeaYr* zTEiFTGI?!RsJbAW8cQyB&oS~ARN&+5PYk`uYEGGaG~u4lN}$;7AVP3q1iLeEnqR-w zJk75nW<$BhG7rLnc90>=p_josj9eK^ddINe^D#p@!~S89kck~k8V!W-s+Fu682kbm z(CwWf(*T8ZhiM=#igUVAW-KO>z)VD#&Y~^y4v#h{R*x~sx=y;-f_m~!k{lx9vw#2C zPTxQ{S#+)6+L=xL&Uk0R3m4WVpgD6&sOoIruDlvMp^^SzcN|VJDC+YU&hRKp{f)zG zDseO>h$sch6^vtU%jsL#k0cRpukEwkFh%JtJl~RrdKUG*4qaCetB#n~A^;dMon;~i zWd@5%euJ*tMfcYKq{bM3$@&0!Pj$puNReeQ=FnOGh?gV~LE?!g#l!`FvNKK8Y7xa- z2|Ol~noCvIOd#fPl0W#$u9m}HtvqBi*q+Kc)G5*#(J%jPN|>asj1ZwV8!=l55t%(n zlmI%@pS4t2=PhOYA_3*#0ZEd`7b1j(S9K#Hkh6MiT3bj(xA&e}su@BEVnQi+ij?Eg zLGIe7{Q2!g>%`8o)duKSCrT}AT4u6)1!pB6mwVx=-?r3C?Il+&j~5QgvsTO-N6*{|hS%S8>U6&|0Km1xd+^15j&6 zE8ABTZuPiB{gsa2Diy*B3ycKgc6eVb;Ne47EK-s8Plr$VY)b6Jq7NUbX#22Ke|5~; z)ERz&yf_c@)!wcim2ee_JZH$VqfV;k!+c?N&Fr6#Ds>a(tR}Ij=Olg38I}ek5Ii@m|L1bLJ&KYLd znhQF0PQ-!o;2#`ZA9tU^`7}#2w!jyP=>b@|?9_IkT4=f+T7*bX(GC{SP1L0xZFF!p z47?w_RF2b3Nc3^tt`UnU{P?gUB62ovA<1@=8*gX@N6hTzF)3^RMElAX8)UY9e6V_1VbcijFRF6uy=k^8FB48eR`@=FiP@U z`P4}TG;Yl02zxR84}4Gs!+3-Hgzq6d&@Bbj?rQ9h$|RZqk0_@{t1E*2F??KXypsK} zD6*#AN3H3Mrl>B$u8Q2Zu3w&tDfjJ7SMs7R!@zI}ra8n_g<_nQfyr(BrqW^PqIY9W^FprPq!N{I&cjb3sh2xMD< z4&Q9(M(JB3>g|bRf$#5cDmo(4y5TGCFew9T2-**LdZwI`%OYgLVRI@(!m&)&8&Gge z?th|p9F5z_nvQ8sQUsIRGWy==z6x@dIQQi_gh%h!rN~e*uMiulSP|WzqmI}1ZJeNt zflFj+A?QJ-JW50i{%|?v>5+h|dx|wpiJo0&T1>{Ah;qu`VO@4S{u8@7_fVuU?ZQov zH|naA-}Mo<$OS9o0<`EHvo2KsRpH(o6b3o8p+c;Mt zHi1QLvm7+<=mD{Um%Xqu2K-W&9rUd-8aK$0^&0KU*!*HQHKSiPgm;+FYBAoIa z89^MaC9KjJ?t!GvX@x{jXL84fLa$ixi~Hd+HfIEJvkp#hNpE9Ia7gVowpAjmR5c%s)K_-xYLMKV z3r%|>;E)!WveS`Qg@Zf2zm*-HQ|s-|SY($W&zm>CsG;AR&uTZhY&Rjo413zg z#Q*vw^XLqnDlX2^NW4xB(#C9n&hsPE{q8rhyv=GcU$K!lqEV)*wT3^q!lCS%Y0!?7 zeAp$<@{tSpbAAMnEm_WFV=rr{%n{6JwT9DTuX%6tgl<{!Bxrvfy+U`$=TL!!)ut_brbpA%D$YMW8^Cq zt*Fx}!h1t|?m?L9qCNlX=q#lu5&@D)l6M{Bta@YSYt@kNlPrq-wY~;TgUJ|L`rb(} 
zy}(UmQEVhvX-tnI9s88UoS+Mqub7P$-5pCFDAgd3y^2AOM9F4eW7?l!1sRd5NpWEn zX8o9M=F@5>-9fie3l-nBbj(8cxQ5gmIl*Er=q+W*#7Mqf*4Ucj#W}3%kWeTg)9o7< z&&WTv%seeJQ{Trhde51Or0KsqY!+%q*5vy5a388;Bcs`_L11BqxKY5ad>PEsgjaZX zl;+F0D<*U`$=0BoU#u^jQ}DvI6y6oG=S%uMy~jsu)QU=&ie9XalItgQa;VD zNq6BGN3mkj`_9O3X`PGMq%Nn5&|ryJvpc2j`i=JqtzOz^P|&w3Gn?QfGEX9%%rJp} zPBSUe`L68QWDV$@v@^w>KE<~1Lklhm!!>c&TwT2G3f!vIVqht(B~~*}wso{V)`6QB z5bMDQ;fQSo%5)xscpkc~ALh-d;jP1CmQtxMF0T$tA7GyyT*IAVp8O~OwN&>%Ik^AI zk}wAAKS$F-qnK@cn9%&L9uPL1{2-&mgP@J_dBpa{-Yt^#Xys`dlF$3z-F<>b4`l(L zHyj@|@p8|nTH+>;sAZ_gMNv~i(3VJN6ePPhw)k6C&=awE@F9d%O*9rO)Rl?%d}>xv z=ZUi)!xLk#Y**~EsG6`Vz*km6Oc|fR5A9P&xQLEs+{Qx~anzomC$pxNtipDOr_;c?`&%me)*cIT%`TcA8*EmqWEIcthHxki6M0T_2=WD5*hVAH^lRU7dO5G zPI#3Qp}qHAOrGX>nq zoudgbzR8rxXXwi%8Bn3XEH_>trs`pJjXJ~#9Qs-e-wt0`v}nMvnk>KK|Lltdf-ycZ zh4^-*9<2MU>v&iO3=RkYz~U)@`Om@2=g_^iKFHj}*a6I7Z)7qUJqUWzx9>5ZpYhPl zTSjPTd+NSAi?mjDe>G^v)5Moc&0*2+6|xKV(k9kR#95cas*(=%3BHF^N}sQWt4GdEL1N1RCb!@}ol8g0EZh?ZM7&X1lG$!fSVllnvA86U8btZkCZ zP7bL{$O2pB~|IMZzDl$djL5fuLWb*c4Dh66yUm6kev! z7wkh&FDKo_8h56^-Ij5<)}z-n?vu%+-n$z%cxXyH;A|Bx?0`D9iLFV!P^;SVf}rS$$PkW@FG#>hpO@ zmHD^&{z&L~(%AnH-cwD)24rGx>S+J3R2~NwMzD>Im90L=*oyK0jn4tB4|a4g(6|2& zd<4H~&(_|?)ZWtZ!qx&7# zb3?!S_`BUFtf!I5iYW`xOUa2d{*Ns`VMlSaEU1Iz2|Kv`x@uUe)K#eEup?bRodYmo zVBujpfiVlsGr<_DD%om%Vfy~ZO}KiMyu9GY_~71j%+pcRWNU`IHET*%qZUcVb7T!Qf-a)a|Zo#WZDBR{j=HG z>$ei3a8ut$(Red5*O7J_9TSD2GRDwJFlsvcR@D|z5ezZ<$D?2lh}D?o&RhNO$YP@7 zyxfqMg${5sXI379_qq1Jzx^@)FW@+m@jqgHvivtt&z2AI0u%7h$!t%m{}cU@+V*#@ zzo)DHVo`po{!{YWA5#8!=6~Led4{cDH2t)s{CWTJ@0@>*t32=4{Gv(ZzwF)oo$=4> z)pOY87l~p2ISBK2zCUB{^J?~sC`q62_P^J*zjOT=a{fBND6+qz^dFr61WCWz8zuk! zwEqL3{`z=-@c{tOQ~wviQ~Yl^e+I07mh%i*zewmea{dZn|19Yl9)6MAZzMfK@4vy~ zpQZeDT;;!!@@IhiXE}fE&F*jH`~tlHTPeh^|F@QZ0pP!-{4pK=JWZb=@E55(N%<`x W%1cATJl}@;^ci|u*ZL`+&;AR8nKVBD literal 0 HcmV?d00001