reader.rs - xl-cli-tools - CLI tools for viewing and editing Excel files

reader.rs (14180B)
      1 use anyhow::{Context, Result};
      2 use calamine::{open_workbook_auto, Data, Reader};
      3 use polars::prelude::*;
      4 use std::path::Path;
      5 
      6 #[derive(Debug, Clone, Copy, PartialEq)]
      7 enum InferredType {
      8     Int,
      9     Float,
     10     String,
     11     Bool,
     12     DateTime,
     13     Empty,
     14 }
     15 
     16 pub fn read_sheet(path: &Path, sheet_name: &str) -> Result<DataFrame> {
     17     let mut workbook = open_workbook_auto(path)
     18         .with_context(|| format!("Cannot open workbook: {}", path.display()))?;
     19     let range = workbook
     20         .worksheet_range(sheet_name)
     21         .with_context(|| format!("Cannot read sheet: {sheet_name}"))?;
     22     range_to_dataframe(&range)
     23 }
     24 
     25 pub fn range_to_dataframe(range: &calamine::Range<Data>) -> Result<DataFrame> {
     26     range_to_dataframe_skip(range, 0)
     27 }
     28 
     29 /// Read a sheet, skipping the first `skip` rows before treating the next row as the header.
     30 pub fn read_sheet_with_skip(path: &Path, sheet_name: &str, skip: usize) -> Result<DataFrame> {
     31     let mut workbook = open_workbook_auto(path)
     32         .with_context(|| format!("Cannot open workbook: {}", path.display()))?;
     33     let range = workbook
     34         .worksheet_range(sheet_name)
     35         .with_context(|| format!("Cannot read sheet: {sheet_name}"))?;
     36     range_to_dataframe_skip(&range, skip)
     37 }
     38 
     39 /// Convert a calamine Range to a DataFrame, skipping `skip` rows before the header.
     40 pub fn range_to_dataframe_skip(range: &calamine::Range<Data>, skip: usize) -> Result<DataFrame> {
     41     let rows: Vec<&[Data]> = range.rows().skip(skip).collect();
     42     let cols = if rows.is_empty() {
     43         0
     44     } else {
     45         rows.iter().map(|r| r.len()).max().unwrap_or(0)
     46     };
     47 
     48     if rows.is_empty() || cols == 0 {
     49         return Ok(DataFrame::default());
     50     }
     51 
     52     // First row (after skip) = headers
     53     let headers: Vec<String> = rows[0]
     54         .iter()
     55         .enumerate()
     56         .map(|(i, cell)| match cell {
     57             Data::String(s) => s.clone(),
     58             _ => format!("column_{i}"),
     59         })
     60         .collect();
     61 
     62     if rows.len() == 1 {
     63         // Header only, no data
     64         let series: Vec<Column> = headers
     65             .iter()
     66             .map(|name| {
     67                 Series::new_empty(PlSmallStr::from(name.as_str()), &DataType::Null).into_column()
     68             })
     69             .collect();
     70         return DataFrame::new(series).map_err(Into::into);
     71     }
     72 
     73     let data_rows = &rows[1..];
     74     let mut columns: Vec<Column> = Vec::with_capacity(cols);
     75 
     76     for col_idx in 0..cols {
     77         let cells: Vec<&Data> = data_rows
     78             .iter()
     79             .map(|row| {
     80                 if col_idx < row.len() {
     81                     &row[col_idx]
     82                 } else {
     83                     &Data::Empty
     84                 }
     85             })
     86             .collect();
     87 
     88         let col_type = infer_column_type(&cells);
     89         let series = build_series(&headers[col_idx], &cells, col_type)?;
     90         columns.push(series.into_column());
     91     }
     92 
     93     DataFrame::new(columns).map_err(Into::into)
     94 }
     95 
     96 fn infer_column_type(cells: &[&Data]) -> InferredType {
     97     let mut has_int = false;
     98     let mut has_float = false;
     99     let mut has_string = false;
    100     let mut has_bool = false;
    101     let mut has_datetime = false;
    102     let mut all_empty = true;
    103 
    104     for cell in cells {
    105         match cell {
    106             Data::Empty => {}
    107             Data::String(_) | Data::DateTimeIso(_) | Data::DurationIso(_) => {
    108                 has_string = true;
    109                 all_empty = false;
    110             }
    111             Data::Float(_) => {
    112                 has_float = true;
    113                 all_empty = false;
    114             }
    115             Data::Int(_) => {
    116                 has_int = true;
    117                 all_empty = false;
    118             }
    119             Data::Bool(_) => {
    120                 has_bool = true;
    121                 all_empty = false;
    122             }
    123             Data::DateTime(_) => {
    124                 has_datetime = true;
    125                 all_empty = false;
    126             }
    127             Data::Error(_) => {
    128                 has_string = true;
    129                 all_empty = false;
    130             }
    131         }
    132     }
    133 
    134     if all_empty {
    135         return InferredType::Empty;
    136     }
    137     // String trumps everything
    138     if has_string {
    139         return InferredType::String;
    140     }
    141     // DateTime only if all non-empty cells are datetime
    142     if has_datetime && !has_int && !has_float && !has_bool {
    143         return InferredType::DateTime;
    144     }
    145     // Bool only if all non-empty cells are bool
    146     if has_bool && !has_int && !has_float && !has_datetime {
    147         return InferredType::Bool;
    148     }
    149     // Float if any float or mix of int/float
    150     if has_float {
    151         return InferredType::Float;
    152     }
    153     if has_int {
    154         return InferredType::Int;
    155     }
    156     // Fallback: mixed datetime/bool/etc → string
    157     InferredType::String
    158 }
    159 
    160 fn build_series(name: &str, cells: &[&Data], col_type: InferredType) -> Result<Series> {
    161     let plname = PlSmallStr::from(name);
    162     match col_type {
    163         InferredType::Int => {
    164             let values: Vec<Option<i64>> = cells
    165                 .iter()
    166                 .map(|cell| match cell {
    167                     Data::Int(v) => Some(*v),
    168                     Data::Empty => None,
    169                     _ => None,
    170                 })
    171                 .collect();
    172             Ok(Series::new(plname, &values))
    173         }
    174         InferredType::Float => {
    175             let values: Vec<Option<f64>> = cells
    176                 .iter()
    177                 .map(|cell| match cell {
    178                     Data::Float(v) => Some(*v),
    179                     Data::Int(v) => Some(*v as f64),
    180                     Data::Empty => None,
    181                     _ => None,
    182                 })
    183                 .collect();
    184             Ok(Series::new(plname, &values))
    185         }
    186         InferredType::Bool => {
    187             let values: Vec<Option<bool>> = cells
    188                 .iter()
    189                 .map(|cell| match cell {
    190                     Data::Bool(v) => Some(*v),
    191                     Data::Empty => None,
    192                     _ => None,
    193                 })
    194                 .collect();
    195             Ok(Series::new(plname, &values))
    196         }
    197         InferredType::DateTime => {
    198             // calamine ExcelDateTime wraps a serial date float (days since 1899-12-30)
    199             // Convert to milliseconds since Unix epoch for polars
    200             let values: Vec<Option<i64>> = cells
    201                 .iter()
    202                 .map(|cell| match cell {
    203                     Data::DateTime(v) => {
    204                         let serial = v.as_f64();
    205                         // Excel epoch: 1899-12-30 = -25569 days from Unix epoch
    206                         let days_from_unix = serial - 25569.0;
    207                         let ms = (days_from_unix * 86_400_000.0) as i64;
    208                         Some(ms)
    209                     }
    210                     Data::Empty => None,
    211                     _ => None,
    212                 })
    213                 .collect();
    214             let series = Series::new(plname, &values);
    215             Ok(series.cast(&DataType::Datetime(TimeUnit::Milliseconds, None))?)
    216         }
    217         InferredType::String | InferredType::Empty => {
    218             let values: Vec<Option<String>> = cells
    219                 .iter()
    220                 .map(|cell| match cell {
    221                     Data::String(s) => Some(s.clone()),
    222                     Data::Float(v) => Some(v.to_string()),
    223                     Data::Int(v) => Some(v.to_string()),
    224                     Data::Bool(v) => Some(v.to_string()),
    225                     Data::DateTime(v) => Some(v.as_f64().to_string()),
    226                     Data::Error(e) => Some(format!("{e:?}")),
    227                     Data::DateTimeIso(s) | Data::DurationIso(s) => Some(s.clone()),
    228                     Data::Empty => None,
    229                 })
    230                 .collect();
    231             Ok(Series::new(plname, &values))
    232         }
    233     }
    234 }
    235 
    236 #[cfg(test)]
    237 mod tests {
    238     use super::*;
    239     use rust_xlsxwriter::Workbook;
    240     use tempfile::NamedTempFile;
    241 
    242     /// Create a simple xlsx with mixed types: string, float, float (int-like), bool
    243     fn create_simple(path: &std::path::Path) {
    244         let mut wb = Workbook::new();
    245         let ws = wb.add_worksheet().set_name("Data").unwrap();
    246         ws.write_string(0, 0, "name").unwrap();
    247         ws.write_string(0, 1, "amount").unwrap();
    248         ws.write_string(0, 2, "count").unwrap();
    249         ws.write_string(0, 3, "active").unwrap();
    250 
    251         let names = ["Alice", "Bob", "Charlie", "Diana", "Eve"];
    252         let amounts = [100.50, 200.75, 300.00, 400.25, 500.00];
    253         let counts = [10.0, 20.0, 30.0, 40.0, 50.0];
    254         let active = [true, false, true, false, true];
    255 
    256         for i in 0..5u32 {
    257             ws.write_string(i + 1, 0, names[i as usize]).unwrap();
    258             ws.write_number(i + 1, 1, amounts[i as usize]).unwrap();
    259             ws.write_number(i + 1, 2, counts[i as usize]).unwrap();
    260             ws.write_boolean(i + 1, 3, active[i as usize]).unwrap();
    261         }
    262         wb.save(path).unwrap();
    263     }
    264 
    265     /// Create xlsx with header row only, no data
    266     fn create_empty_data(path: &std::path::Path) {
    267         let mut wb = Workbook::new();
    268         let ws = wb.add_worksheet().set_name("Empty").unwrap();
    269         ws.write_string(0, 0, "col_a").unwrap();
    270         ws.write_string(0, 1, "col_b").unwrap();
    271         wb.save(path).unwrap();
    272     }
    273 
    274     /// Create xlsx with a completely empty sheet
    275     fn create_empty_sheet(path: &std::path::Path) {
    276         let mut wb = Workbook::new();
    277         wb.add_worksheet().set_name("Blank").unwrap();
    278         wb.save(path).unwrap();
    279     }
    280 
    281     #[test]
    282     fn test_infer_int_column() {
    283         let cells = vec![
    284             &Data::Int(1),
    285             &Data::Int(2),
    286             &Data::Empty,
    287             &Data::Int(4),
    288         ];
    289         assert_eq!(infer_column_type(&cells), InferredType::Int);
    290     }
    291 
    292     #[test]
    293     fn test_infer_float_when_mixed_int_float() {
    294         let cells = vec![
    295             &Data::Int(1),
    296             &Data::Float(2.5),
    297             &Data::Int(3),
    298         ];
    299         assert_eq!(infer_column_type(&cells), InferredType::Float);
    300     }
    301 
    302     #[test]
    303     fn test_infer_string_trumps_all() {
    304         let s = Data::String("hello".to_string());
    305         let cells: Vec<&Data> = vec![
    306             &Data::Int(1),
    307             &s,
    308             &Data::Float(3.0),
    309         ];
    310         assert_eq!(infer_column_type(&cells), InferredType::String);
    311     }
    312 
    313     #[test]
    314     fn test_infer_empty_column() {
    315         let cells: Vec<&Data> = vec![&Data::Empty, &Data::Empty];
    316         assert_eq!(infer_column_type(&cells), InferredType::Empty);
    317     }
    318 
    319     #[test]
    320     fn test_infer_bool_column() {
    321         let cells = vec![
    322             &Data::Bool(true),
    323             &Data::Bool(false),
    324             &Data::Empty,
    325         ];
    326         assert_eq!(infer_column_type(&cells), InferredType::Bool);
    327     }
    328 
    329     #[test]
    330     fn test_empty_range() {
    331         let range: calamine::Range<Data> = Default::default();
    332         let df = range_to_dataframe(&range).unwrap();
    333         assert_eq!(df.height(), 0);
    334         assert_eq!(df.width(), 0);
    335     }
    336 
    337     #[test]
    338     fn test_range_to_dataframe_basic() {
    339         let tmp = NamedTempFile::with_suffix(".xlsx").unwrap();
    340         create_simple(tmp.path());
    341 
    342         let df = read_sheet(tmp.path(), "Data").unwrap();
    343         assert_eq!(df.height(), 5);
    344         assert_eq!(df.width(), 4);
    345 
    346         let names: Vec<String> = df
    347             .get_column_names()
    348             .iter()
    349             .map(|s| s.to_string())
    350             .collect();
    351         assert_eq!(names, vec!["name", "amount", "count", "active"]);
    352     }
    353 
    354     #[test]
    355     fn test_read_sheet_types() {
    356         let tmp = NamedTempFile::with_suffix(".xlsx").unwrap();
    357         create_simple(tmp.path());
    358 
    359         let df = read_sheet(tmp.path(), "Data").unwrap();
    360 
    361         // name column should be String
    362         assert_eq!(df.column("name").unwrap().dtype(), &DataType::String);
    363 
    364         // amount column should be Float64
    365         assert_eq!(df.column("amount").unwrap().dtype(), &DataType::Float64);
    366 
    367         // active column should be Boolean
    368         assert_eq!(df.column("active").unwrap().dtype(), &DataType::Boolean);
    369     }
    370 
    371     #[test]
    372     fn test_read_sheet_header_only() {
    373         let tmp = NamedTempFile::with_suffix(".xlsx").unwrap();
    374         create_empty_data(tmp.path());
    375 
    376         let df = read_sheet(tmp.path(), "Empty").unwrap();
    377         assert_eq!(df.height(), 0);
    378         assert_eq!(df.width(), 2);
    379         let names: Vec<String> = df
    380             .get_column_names()
    381             .iter()
    382             .map(|s| s.to_string())
    383             .collect();
    384         assert_eq!(names, vec!["col_a", "col_b"]);
    385     }
    386 
    387     #[test]
    388     fn test_read_sheet_empty_sheet() {
    389         let tmp = NamedTempFile::with_suffix(".xlsx").unwrap();
    390         create_empty_sheet(tmp.path());
    391 
    392         let df = read_sheet(tmp.path(), "Blank").unwrap();
    393         assert_eq!(df.height(), 0);
    394         assert_eq!(df.width(), 0);
    395     }
    396 
    397     /// Create xlsx with metadata rows above the real header
    398     fn create_with_metadata_rows(path: &std::path::Path) {
    399         let mut wb = Workbook::new();
    400         let ws = wb.add_worksheet().set_name("Data").unwrap();
    401         ws.write_string(0, 0, "Report Title").unwrap();
    402         ws.write_string(1, 0, "Generated 2024-01-01").unwrap();
    403         ws.write_string(2, 0, "Name").unwrap();
    404         ws.write_string(2, 1, "Value").unwrap();
    405         ws.write_string(3, 0, "Alice").unwrap();
    406         ws.write_number(3, 1, 100.0).unwrap();
    407         ws.write_string(4, 0, "Bob").unwrap();
    408         ws.write_number(4, 1, 200.0).unwrap();
    409         wb.save(path).unwrap();
    410     }
    411 
    412     #[test]
    413     fn test_read_with_skip() {
    414         let tmp = NamedTempFile::with_suffix(".xlsx").unwrap();
    415         create_with_metadata_rows(tmp.path());
    416         let df = read_sheet_with_skip(tmp.path(), "Data", 2).unwrap();
    417         let col_names: Vec<String> = df.get_column_names().iter().map(|s| s.to_string()).collect();
    418         assert_eq!(col_names, vec!["Name", "Value"]);
    419         assert_eq!(df.height(), 2);
    420     }
    421 
    422     #[test]
    423     fn test_read_with_skip_zero() {
    424         let tmp = NamedTempFile::with_suffix(".xlsx").unwrap();
    425         create_simple(tmp.path());
    426         let df = read_sheet_with_skip(tmp.path(), "Data", 0).unwrap();
    427         assert_eq!(df.height(), 5);
    428         assert_eq!(df.width(), 4);
    429     }
    430 }
	xl-cli-tools CLI tools for viewing and editing Excel files
	Log \| Files \| Refs \| README \| LICENSE