xl-cli-tools

CLI tools for viewing and editing Excel files
Log | Files | Refs | README | LICENSE

commit 6eef68fc23f8d6ad8cdf5e40e5542b8543562d7d
parent 1228da956b1940425e10a130d7f829382cd593d3
Author: Erik Loualiche <[email protected]>
Date:   Fri, 13 Mar 2026 16:06:17 -0500

feat: implement describe mode — summary statistics per column

Replace format_describe stub with real implementation using Polars Series
methods (mean, std, min_reduce, max_reduce, median, n_unique). Outputs a
markdown table with stats as rows and DataFrame columns as columns, showing
"-" for non-numeric types. Add compute_stat helper and is_numeric predicate.
Add integration test for --describe flag and update the stub unit test.

Co-Authored-By: Claude Sonnet 4.6 <[email protected]>

Diffstat:
Mxlcat/src/formatter.rs | 137+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mxlcat/tests/test_integration.rs | 15+++++++++++++++
2 files changed, 145 insertions(+), 7 deletions(-)

diff --git a/xlcat/src/formatter.rs b/xlcat/src/formatter.rs @@ -149,9 +149,112 @@ pub fn format_empty_sheet(sheet: &SheetInfo) -> String { } } -/// Stub for describe mode (implemented in Task 7). -pub fn format_describe(_df: &DataFrame) -> String { - "(describe not yet implemented)\n".to_string() +/// Render summary statistics for each column as a markdown table. +/// +/// Stats are rows, columns are DataFrame columns: +/// | stat | col1 | col2 | ... | +/// |------|------|------|-----| +/// | count | ... | ... | ... | +/// ... +pub fn format_describe(df: &DataFrame) -> String { + let columns = df.get_columns(); + let stats = ["count", "null_count", "mean", "std", "min", "max", "median", "unique"]; + + // Header row + let mut out = String::from("| stat |"); + for col in columns { + out.push_str(&format!(" {} |", col.name())); + } + out.push('\n'); + + // Separator + out.push_str("|------|"); + for _ in columns { + out.push_str("---|"); + } + out.push('\n'); + + // Stat rows + for stat in &stats { + out.push_str(&format!("| {stat} |")); + for col in columns { + let val = compute_stat(col, stat); + out.push_str(&format!(" {val} |")); + } + out.push('\n'); + } + + out +} + +fn compute_stat(col: &Column, stat: &str) -> String { + let series = col.as_materialized_series(); + match stat { + "count" => series.len().to_string(), + "null_count" => series.null_count().to_string(), + "mean" => { + if is_numeric(series.dtype()) { + series.mean().map(|v| format!("{v:.4}")).unwrap_or_else(|| "-".into()) + } else { + "-".into() + } + } + "std" => { + if is_numeric(series.dtype()) { + series.std(1).map(|v| format!("{v:.4}")).unwrap_or_else(|| "-".into()) + } else { + "-".into() + } + } + "min" => { + if is_numeric(series.dtype()) { + match series.min_reduce() { + Ok(v) => v.value().to_string(), + Err(_) => "-".into(), + } + } else { + "-".into() + } + } + "max" => { + if is_numeric(series.dtype()) { + match series.max_reduce() { + Ok(v) => v.value().to_string(), + Err(_) => "-".into(), + } + } else { + "-".into() + } + } + "median" => { + if is_numeric(series.dtype()) { + series.median().map(|v| format!("{v:.4}")).unwrap_or_else(|| "-".into()) + } else { + "-".into() + } + } + "unique" => match series.n_unique() { + Ok(n) => n.to_string(), + Err(_) => "-".into(), + }, + _ => "-".into(), + } +} + +fn is_numeric(dtype: &DataType) -> bool { + matches!( + dtype, + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Float32 + | DataType::Float64 + ) } // --------------------------------------------------------------------------- @@ -356,10 +459,30 @@ mod tests { } #[test] - fn test_format_describe_stub() { - let s = Series::new("x".into(), &[1i64]); - let df = DataFrame::new(vec![s.into_column()]).unwrap(); + fn test_format_describe() { + let s_name = Series::new("name".into(), &["Alice", "Bob", "Carol"]); + let s_val = Series::new("value".into(), &[10i64, 20, 30]); + let df = DataFrame::new(vec![s_name.into_column(), s_val.into_column()]).unwrap(); let out = format_describe(&df); - assert!(out.contains("not yet implemented")); + // Header row contains stat and column names + assert!(out.contains("| stat |")); + assert!(out.contains("| name |")); + assert!(out.contains("| value |")); + // All stat rows are present + assert!(out.contains("| count |")); + assert!(out.contains("| null_count |")); + assert!(out.contains("| mean |")); + assert!(out.contains("| std |")); + assert!(out.contains("| min |")); + assert!(out.contains("| max |")); + assert!(out.contains("| median |")); + assert!(out.contains("| unique |")); + // Non-numeric column shows "-" for mean + assert!(out.contains("| mean | - |")); + // Numeric column has a numeric mean value (not "-") + // count=3 for both + assert!(out.contains("| count | 3 | 3 |")); + // unique=3 for both + assert!(out.contains("| unique | 3 | 3 |")); } } diff --git a/xlcat/tests/test_integration.rs b/xlcat/tests/test_integration.rs @@ -171,6 +171,21 @@ fn test_empty_sheet() { } #[test] +fn test_describe_mode() { + let dir = TempDir::new().unwrap(); + let path = dir.path().join("simple.xlsx"); + common::create_simple(&path); + + xlcat() + .arg(path.to_str().unwrap()) + .arg("--describe") + .assert() + .success() + .stdout(predicate::str::contains("count")) + .stdout(predicate::str::contains("mean")); +} + +#[test] fn test_all_without_sheet_on_multi() { let dir = TempDir::new().unwrap(); let path = dir.path().join("multi.xlsx");