xl-cli-tools

CLI tools for viewing and editing Excel files
Log | Files | Refs | README | LICENSE

commit 01c053c325aa199b67f50d9dbae23a96a43e3196
parent a1ae2c1cd006db764627b32b7c247c4183273b32
Author: Erik Loualiche <[email protected]>
Date:   Wed, 18 Mar 2026 09:07:52 -0400

Merge pull request #1 from LouLouLibs/feat/xldiff

feat: add xldiff — Excel sheet comparison tool
Diffstat:
M .github/workflows/release.yml | 6+++++-
A CLAUDE.md | 20++++++++++++++++++++
M Cargo.lock | 2++
M Cargo.toml | 9+++++++++
M README.md | 78++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
A demo/xlfilter.gif | 0
A demo/xlfilter.tape | 60++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adocs/superpowers/plans/2026-03-17-xldiff.md | 1894+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adocs/superpowers/specs/2026-03-17-xldiff-design.md | 248+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/bin/xldiff.rs | 826+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/bin/xlfilter.rs | 207+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/diff.rs | 830+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/filter.rs | 736+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/formatter.rs | 10+++++-----
M src/lib.rs | 2++
M src/reader.rs | 65+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
Mtests/common/mod.rs | 193+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atests/test_xldiff.rs | 223+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atests/test_xlfilter.rs | 384+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
19 files changed, 5771 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml @@ -34,6 +34,7 @@ jobs: mkdir -p dist cp target/${{ matrix.target }}/release/xlcat dist/xlcat-${{ matrix.suffix }} cp target/${{ matrix.target }}/release/xlset dist/xlset-${{ matrix.suffix }} + cp target/${{ matrix.target }}/release/xlfilter dist/xlfilter-${{ matrix.suffix }} - uses: actions/upload-artifact@v4 with: @@ -57,17 +58,20 @@ jobs: # ARM Mac cp artifacts/binaries-aarch64-apple-darwin/xlcat-aarch64-apple-darwin release/ cp artifacts/binaries-aarch64-apple-darwin/xlset-aarch64-apple-darwin release/ + cp artifacts/binaries-aarch64-apple-darwin/xlfilter-aarch64-apple-darwin release/ # x86 Mac cp artifacts/binaries-x86_64-apple-darwin/xlcat-x86_64-apple-darwin release/ cp artifacts/binaries-x86_64-apple-darwin/xlset-x86_64-apple-darwin release/ + cp artifacts/binaries-x86_64-apple-darwin/xlfilter-x86_64-apple-darwin release/ # Demo GIFs cp demo/xlcat.gif release/ cp demo/xlset.gif release/ + cp demo/xlfilter.gif release/ # Make binaries executable - chmod +x release/xlcat-* release/xlset-* + chmod +x release/xlcat-* release/xlset-* release/xlfilter-* - name: Create GitHub Release env: diff --git a/CLAUDE.md b/CLAUDE.md @@ -0,0 +1,20 @@ +# xl-cli-tool + +Rust CLI tools for working with Excel (.xlsx/.xls) files, designed for LLM-assisted analysis (Claude Code). 
+ +## Current tools +- **xlcat** — view/inspect spreadsheets (metadata, schema, stats, data as markdown tables or CSV) +- **xlset** — write cells preserving formatting/formulas/structure +- **xlfilter** — query/filter rows and columns from spreadsheets +- **xldiff** — compare two sheets, report added/removed/modified rows + +## Architecture +- Shared library in `src/lib.rs` (reader, formatter, metadata, cell, writer, filter, diff modules) +- Binary entry points in `src/bin/` +- Dependencies: calamine (read), umya-spreadsheet (write), polars (data), clap (CLI), serde_json (JSON output) + +## Conventions +- Exit codes: 0 = success, 1 = runtime error, 2 = invalid arguments + - Exception: xldiff uses diff(1) convention: 0 = no differences, 1 = differences found, 2 = error +- Output: markdown tables by default, CSV with `--csv` +- Rust edition 2024 diff --git a/Cargo.lock b/Cargo.lock @@ -2160,6 +2160,7 @@ version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" dependencies = [ + "indexmap", "itoa", "memchr", "serde", @@ -2977,6 +2978,7 @@ dependencies = [ "polars", "predicates", "rust_xlsxwriter", + "serde_json", "tempfile", "umya-spreadsheet", ] diff --git a/Cargo.toml b/Cargo.toml @@ -21,12 +21,21 @@ path = "src/bin/xlcat.rs" name = "xlset" path = "src/bin/xlset.rs" +[[bin]] +name = "xlfilter" +path = "src/bin/xlfilter.rs" + +[[bin]] +name = "xldiff" +path = "src/bin/xldiff.rs" + [dependencies] calamine = "0.26" polars = { version = "0.46", features = ["dtype-date", "dtype-datetime", "dtype-duration", "csv"] } clap = { version = "4", features = ["derive"] } anyhow = "1" umya-spreadsheet = "2" +serde_json = { version = "1", features = ["preserve_order"] } [profile.release] strip = true diff --git a/README.md b/README.md @@ -8,12 +8,14 @@ <table> <tr> -<td align="center" width="50%"><strong>xlcat</strong> — view</td> -<td align="center" 
width="50%"><strong>xlset</strong> — edit</td> +<td align="center" width="33%"><strong>xlcat</strong> — view</td> +<td align="center" width="33%"><strong>xlset</strong> — edit</td> +<td align="center" width="33%"><strong>xlfilter</strong> — query</td> </tr> <tr> <td><img src="demo/xlcat.gif" alt="xlcat demo" /></td> <td><img src="demo/xlset.gif" alt="xlset demo" /></td> +<td><img src="demo/xlfilter.gif" alt="xlfilter demo" /></td> </tr> </table> @@ -21,10 +23,11 @@ *** -Two binaries, no runtime dependencies: +Three binaries, no runtime dependencies: - **`xlcat`** — view xlsx/xls files as markdown tables or CSV - **`xlset`** — modify cells in existing xlsx files, preserving formatting +- **`xlfilter`** — filter, sort, and query rows from spreadsheets ## Installation @@ -36,12 +39,14 @@ Download from [Releases](https://github.com/LouLouLibs/xl-cli-tools/releases): # Apple Silicon curl -L https://github.com/LouLouLibs/xl-cli-tools/releases/latest/download/xlcat-aarch64-apple-darwin -o ~/.local/bin/xlcat curl -L https://github.com/LouLouLibs/xl-cli-tools/releases/latest/download/xlset-aarch64-apple-darwin -o ~/.local/bin/xlset -chmod +x ~/.local/bin/xlcat ~/.local/bin/xlset +curl -L https://github.com/LouLouLibs/xl-cli-tools/releases/latest/download/xlfilter-aarch64-apple-darwin -o ~/.local/bin/xlfilter +chmod +x ~/.local/bin/xlcat ~/.local/bin/xlset ~/.local/bin/xlfilter # Intel Mac curl -L https://github.com/LouLouLibs/xl-cli-tools/releases/latest/download/xlcat-x86_64-apple-darwin -o ~/.local/bin/xlcat curl -L https://github.com/LouLouLibs/xl-cli-tools/releases/latest/download/xlset-x86_64-apple-darwin -o ~/.local/bin/xlset -chmod +x ~/.local/bin/xlcat ~/.local/bin/xlset +curl -L https://github.com/LouLouLibs/xl-cli-tools/releases/latest/download/xlfilter-x86_64-apple-darwin -o ~/.local/bin/xlfilter +chmod +x ~/.local/bin/xlcat ~/.local/bin/xlset ~/.local/bin/xlfilter ``` ### From source @@ -115,6 +120,59 @@ xlcat report.xlsx --csv --head 100 > subset.csv - 
**Multiple sheets:** lists schemas, pick one with `--sheet` - **Large file (>1MB):** schema + first 25 rows (override with `--max-size 5M`) +## xlfilter — Query and Filter + +```bash +# Filter rows by value +xlfilter data.xlsx --where "State=CA" + +# Numeric comparisons +xlfilter data.xlsx --where "Amount>1000" + +# Multiple filters (AND) +xlfilter data.xlsx --where "State=CA" --where "Amount>1000" + +# Select columns (by name or letter) +xlfilter data.xlsx --cols State,Amount,Year +xlfilter data.xlsx --cols A,C,D + +# Sort results +xlfilter data.xlsx --sort "Amount:desc" + +# Limit output +xlfilter data.xlsx --sort "Amount:desc" --limit 10 + +# Contains filter (case-insensitive) +xlfilter data.xlsx --where "Name~john" + +# Head/tail (applied before filtering) +xlfilter data.xlsx --head 100 --where "Status=Active" + +# Skip metadata rows above the real header +xlfilter data.xlsx --skip 2 + +# CSV output for piping +xlfilter data.xlsx --where "Status!=Draft" --csv | other-tool + +# Target a specific sheet +xlfilter data.xlsx --sheet Revenue --where "Amount>5000" +``` + +### Filter operators + +| Operator | Meaning | Example | +|----------|---------|---------| +| `=` | Equals | `State=CA` | +| `!=` | Not equals | `Status!=Draft` | +| `>` | Greater than | `Amount>1000` | +| `<` | Less than | `Year<2024` | +| `>=` | Greater or equal | `Score>=90` | +| `<=` | Less or equal | `Price<=50` | +| `~` | Contains (case-insensitive) | `Name~john` | +| `!~` | Not contains | `Name!~test` | + +Numeric columns compare numerically; string columns compare lexicographically. Row count is printed to stderr. 
+ ## xlset — Edit Excel Cells ```bash @@ -171,11 +229,11 @@ Claude Code skills (`/xlcat` and `/xlset`) are available in [claude-skills](http ## Exit codes -| Code | xlcat | xlset | -|------|-------|-------| -| 0 | Success | Success | -| 1 | Runtime error | Runtime error | -| 2 | Invalid arguments | Invalid arguments | +| Code | Meaning | +|------|---------| +| 0 | Success | +| 1 | Runtime error | +| 2 | Invalid arguments | ## License diff --git a/demo/xlfilter.gif b/demo/xlfilter.gif Binary files differ. diff --git a/demo/xlfilter.tape b/demo/xlfilter.tape @@ -0,0 +1,60 @@ +# VHS tape for recording xlfilter demo GIF. +# Usage: vhs demo/xlfilter.tape +# +# Prerequisites: +# - xlfilter binary built and in PATH +# - Demo xlsx files created: cargo run --example create_demo + +Output demo/xlfilter.gif + +Set FontSize 14 +Set Width 1100 +Set Height 550 +Set Padding 15 +Set Theme "GruvboxDarkHard" + +Set TypingSpeed 80ms + +Set Shell "bash" + +Hide + Type 'export PS1="> "' + Enter + Type "clear" + Enter +Show + +# --- 1. FILTER ROWS --- +Type "xlfilter demo/sales.xlsx --where Region=East" +Enter +Sleep 3s + +# --- 2. NUMERIC COMPARISON --- +Type "xlfilter demo/sales.xlsx --where 'Revenue>12000'" +Enter +Sleep 3s + +# --- 3. MULTIPLE FILTERS (AND) --- +Type "xlfilter demo/sales.xlsx --where Region=East --where 'Revenue>10000'" +Enter +Sleep 3s + +# --- 4. SELECT COLUMNS + SORT --- +Type "xlfilter demo/sales.xlsx --cols Region,Product,Revenue --sort Revenue:desc" +Enter +Sleep 3s + +# --- 5. LIMIT OUTPUT --- +Type "xlfilter demo/sales.xlsx --sort Revenue:desc --limit 3 --cols Product,Revenue" +Enter +Sleep 2s + +# --- 6. CONTAINS FILTER --- +Type "xlfilter demo/sales.xlsx --where 'Product~Widget A' --cols Date,Region,Revenue" +Enter +Sleep 3s + +# --- 7. 
CSV OUTPUT FOR PIPING --- +Type "xlfilter demo/sales.xlsx --where Region=West --csv" +Enter +Sleep 3s diff --git a/docs/superpowers/plans/2026-03-17-xldiff.md b/docs/superpowers/plans/2026-03-17-xldiff.md @@ -0,0 +1,1894 @@ +# xldiff Implementation Plan + +> **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Port go-xldiff to Rust as the `xldiff` binary in this repo, with type-aware comparison and tolerance support. + +**Architecture:** New `src/diff.rs` module for core diff logic (data structures, positional/key-based algorithms, tolerance). New `src/bin/xldiff.rs` binary for CLI parsing, file:sheet argument handling, and four output formatters (text, markdown, json, csv). Reuses existing `reader`, `metadata`, `filter` (for `resolve_column`), and `formatter` modules. + +**Tech Stack:** Rust, polars (DataFrames), calamine (reading), clap (CLI), serde_json (JSON output), std::io::IsTerminal (TTY detection) + +**Spec:** `docs/superpowers/specs/2026-03-17-xldiff-design.md` + +--- + +### Task 1: Cargo.toml & lib.rs setup + +**Files:** +- Modify: `Cargo.toml` +- Modify: `src/lib.rs` + +- [ ] **Step 1: Add xldiff binary entry and dependencies to Cargo.toml** + +Add after the xlfilter `[[bin]]` block: + +```toml +[[bin]] +name = "xldiff" +path = "src/bin/xldiff.rs" +``` + +Add to `[dependencies]`: + +```toml +serde_json = { version = "1", features = ["preserve_order"] } +``` + +- [ ] **Step 2: Add diff module to lib.rs** + +```rust +pub mod cell; +pub mod diff; +pub mod filter; +pub mod formatter; +pub mod metadata; +pub mod reader; +pub mod writer; +``` + +- [ ] **Step 3: Create empty diff.rs so the project compiles** + +Create `src/diff.rs`: + +```rust +// Diff engine for comparing two Excel sheets. 
+``` + +- [ ] **Step 4: Verify project compiles** + +Run: `cargo check` +Expected: success (no errors) + +- [ ] **Step 5: Commit** + +```bash +git add Cargo.toml src/lib.rs src/diff.rs +git commit -m "chore: add xldiff binary entry, serde deps, empty diff module" +``` + +--- + +### Task 2: Make formatter helpers public + +**Files:** +- Modify: `src/formatter.rs` + +The markdown output formatter in xldiff needs `render_table`, `compute_col_widths`, `render_table_header`, `render_table_rows`, and `format_any_value` from `formatter.rs`. These are currently private. + +- [ ] **Step 1: Make the five helper functions public** + +In `src/formatter.rs`, change: + +- `fn compute_col_widths(` → `pub fn compute_col_widths(` +- `fn render_table_header(` → `pub fn render_table_header(` +- `fn render_table_rows(` → `pub fn render_table_rows(` +- `fn render_table(` → `pub fn render_table(` +- `fn format_any_value(` → `pub fn format_any_value(` + +- [ ] **Step 2: Verify project compiles** + +Run: `cargo check` +Expected: success + +- [ ] **Step 3: Commit** + +```bash +git add src/formatter.rs +git commit -m "refactor: make formatter table helpers public for xldiff reuse" +``` + +--- + +### Task 3: Make `col_letter_to_index` and `resolve_column` available from filter module + +**Files:** +- Modify: `src/filter.rs` + +`col_letter_to_index` is currently private in `filter.rs`. The xldiff binary needs `resolve_column` (already public) and `resolve_columns` (already public), which internally use `col_letter_to_index`. Since `resolve_column` is already public, no changes are needed here — xldiff will call `xlcat::filter::resolve_column()` directly. + +**This task is a no-op** — confirming that the existing public API is sufficient. Skip to Task 4. + +--- + +### Task 4: diff.rs — data structures + +**Files:** +- Modify: `src/diff.rs` + +- [ ] **Step 1: Write the data structures** + +```rust +use polars::prelude::*; + +/// Source file and sheet metadata for display. 
+#[derive(Debug, Clone)] +pub struct SheetSource { + pub file_name: String, + pub sheet_name: String, +} + +/// A single row from an added or removed set. +#[derive(Debug, Clone)] +pub struct DiffRow { + pub values: Vec<String>, +} + +/// A change in a single cell. +#[derive(Debug, Clone)] +pub struct CellChange { + pub column: String, + pub old_value: String, + pub new_value: String, +} + +/// A row present in both files with cell-level differences. +#[derive(Debug, Clone)] +pub struct ModifiedRow { + pub key: Vec<String>, + pub changes: Vec<CellChange>, +} + +/// Result of comparing two sheets. +#[derive(Debug, Clone)] +pub struct DiffResult { + pub headers: Vec<String>, + pub key_columns: Vec<String>, + pub added: Vec<DiffRow>, + pub removed: Vec<DiffRow>, + pub modified: Vec<ModifiedRow>, + pub source_a: SheetSource, + pub source_b: SheetSource, +} + +impl DiffResult { + pub fn has_differences(&self) -> bool { + !self.added.is_empty() || !self.removed.is_empty() || !self.modified.is_empty() + } +} + +/// Options controlling how the diff is performed. 
+#[derive(Debug, Clone, Default)] +pub struct DiffOptions { + pub key_columns: Vec<String>, + pub tolerance: Option<f64>, +} +``` + +- [ ] **Step 2: Verify project compiles** + +Run: `cargo check` +Expected: success + +- [ ] **Step 3: Commit** + +```bash +git add src/diff.rs +git commit -m "feat(xldiff): add diff data structures — DiffResult, DiffRow, ModifiedRow, CellChange" +``` + +--- + +### Task 5: diff.rs — positional mode + +**Files:** +- Modify: `src/diff.rs` +- Create: `tests/test_xldiff.rs` (unit tests only at first) + +- [ ] **Step 1: Write the failing test for positional diff** + +Add at the bottom of `src/diff.rs`: + +```rust +#[cfg(test)] +mod tests { + use super::*; + + fn make_source(name: &str) -> SheetSource { + SheetSource { + file_name: format!("{name}.xlsx"), + sheet_name: "Sheet1".to_string(), + } + } + + #[test] + fn test_positional_no_diff() { + let s1 = Series::new("name".into(), &["Alice", "Bob"]); + let s2 = Series::new("score".into(), &[90i64, 80]); + let df_a = DataFrame::new(vec![s1.clone().into_column(), s2.clone().into_column()]).unwrap(); + let df_b = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap(); + + let opts = DiffOptions::default(); + let result = diff_positional(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap(); + assert!(!result.has_differences()); + assert!(result.added.is_empty()); + assert!(result.removed.is_empty()); + } + + #[test] + fn test_positional_added_removed() { + let df_a = DataFrame::new(vec![ + Series::new("name".into(), &["Alice", "Bob"]).into_column(), + ]).unwrap(); + let df_b = DataFrame::new(vec![ + Series::new("name".into(), &["Alice", "Charlie"]).into_column(), + ]).unwrap(); + + let opts = DiffOptions::default(); + let result = diff_positional(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap(); + assert_eq!(result.removed.len(), 1); + assert_eq!(result.removed[0].values, vec!["Bob"]); + assert_eq!(result.added.len(), 1); + 
assert_eq!(result.added[0].values, vec!["Charlie"]); + assert!(result.modified.is_empty()); + } + + #[test] + fn test_positional_duplicate_rows() { + let df_a = DataFrame::new(vec![ + Series::new("x".into(), &["A", "A", "A"]).into_column(), + ]).unwrap(); + let df_b = DataFrame::new(vec![ + Series::new("x".into(), &["A", "A"]).into_column(), + ]).unwrap(); + + let opts = DiffOptions::default(); + let result = diff_positional(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap(); + assert_eq!(result.removed.len(), 1); + assert!(result.added.is_empty()); + } +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cargo test --lib diff::tests` +Expected: FAIL — `diff_positional` not found + +- [ ] **Step 3: Implement positional diff** + +Add to `src/diff.rs` before the tests module: + +```rust +use crate::formatter; +use std::collections::HashMap; + +/// Format a cell value for hashing/display, using a null sentinel to distinguish null from empty. +fn cell_to_string(col: &Column, idx: usize) -> String { + match col.get(idx) { + Ok(AnyValue::Null) | Err(_) => String::new(), + Ok(v) => formatter::format_any_value(&v), + } +} + +fn cell_to_key_part(col: &Column, idx: usize) -> String { + match col.get(idx) { + Ok(AnyValue::Null) | Err(_) => "\x01NULL\x01".to_string(), + Ok(v) => formatter::format_any_value(&v), + } +} + +/// Convert a DataFrame row to a string key by joining cell values with null bytes. +fn row_to_key(df: &DataFrame, row_idx: usize) -> String { + let cols = df.get_columns(); + let parts: Vec<String> = cols + .iter() + .map(|c| cell_to_key_part(c, row_idx)) + .collect(); + parts.join("\0") +} + +/// Convert a DataFrame row to a Vec<String> of display values. +fn row_to_strings(df: &DataFrame, row_idx: usize) -> Vec<String> { + df.get_columns() + .iter() + .map(|c| cell_to_string(c, row_idx)) + .collect() +} + +/// Positional diff: treats entire row as identity using multiset comparison. 
+pub fn diff_positional( + df_a: &DataFrame, + df_b: &DataFrame, + _opts: &DiffOptions, + source_a: SheetSource, + source_b: SheetSource, +) -> anyhow::Result<DiffResult> { + let headers_a: Vec<String> = df_a.get_column_names().iter().map(|s| s.to_string()).collect(); + let headers_b: Vec<String> = df_b.get_column_names().iter().map(|s| s.to_string()).collect(); + + // Warn about mismatched column counts in positional mode + if headers_a.len() != headers_b.len() { + eprintln!( + "warning: column count differs ({} vs {}), padding shorter side with empty values", + headers_a.len(), headers_b.len() + ); + } + + // Use headers from the file with more columns + let headers = if headers_b.len() > headers_a.len() { headers_b } else { headers_a }; + + // Build frequency maps + let mut counts_a: HashMap<String, Vec<usize>> = HashMap::new(); + for i in 0..df_a.height() { + counts_a.entry(row_to_key(df_a, i)).or_default().push(i); + } + let mut counts_b: HashMap<String, Vec<usize>> = HashMap::new(); + for i in 0..df_b.height() { + counts_b.entry(row_to_key(df_b, i)).or_default().push(i); + } + + let mut removed = Vec::new(); + let mut consumed_b: HashMap<String, usize> = HashMap::new(); + + // Walk A: try to match with B + for i in 0..df_a.height() { + let key = row_to_key(df_a, i); + let b_count = counts_b.get(&key).map(|v| v.len()).unwrap_or(0); + let already_consumed = consumed_b.get(&key).copied().unwrap_or(0); + if already_consumed < b_count { + *consumed_b.entry(key).or_insert(0) += 1; + } else { + let mut vals = row_to_strings(df_a, i); + vals.resize(headers.len(), String::new()); // pad if needed + removed.push(DiffRow { values: vals }); + } + } + + // Walk B: anything not consumed is added + let mut consumed_a: HashMap<String, usize> = HashMap::new(); + let mut added = Vec::new(); + for i in 0..df_b.height() { + let key = row_to_key(df_b, i); + let a_count = counts_a.get(&key).map(|v| v.len()).unwrap_or(0); + let already_consumed = 
consumed_a.get(&key).copied().unwrap_or(0); + if already_consumed < a_count { + *consumed_a.entry(key).or_insert(0) += 1; + } else { + let mut vals = row_to_strings(df_b, i); + vals.resize(headers.len(), String::new()); // pad if needed + added.push(DiffRow { values: vals }); + } + } + + Ok(DiffResult { + headers, + key_columns: vec![], + added, + removed, + modified: vec![], + source_a, + source_b, + }) +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cargo test --lib diff::tests` +Expected: all 3 tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add src/diff.rs +git commit -m "feat(xldiff): implement positional diff with multiset comparison" +``` + +--- + +### Task 6: diff.rs — key-based mode + +**Files:** +- Modify: `src/diff.rs` + +- [ ] **Step 1: Write failing tests for key-based diff** + +Add to the `tests` module in `src/diff.rs`: + +```rust + #[test] + fn test_keyed_no_diff() { + let df_a = DataFrame::new(vec![ + Series::new("id".into(), &["1", "2"]).into_column(), + Series::new("val".into(), &[10i64, 20]).into_column(), + ]).unwrap(); + let df_b = df_a.clone(); + + let opts = DiffOptions { key_columns: vec!["id".to_string()], ..Default::default() }; + let result = diff_keyed(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap(); + assert!(!result.has_differences()); + } + + #[test] + fn test_keyed_added_removed() { + let df_a = DataFrame::new(vec![ + Series::new("id".into(), &["1", "2"]).into_column(), + Series::new("val".into(), &[10i64, 20]).into_column(), + ]).unwrap(); + let df_b = DataFrame::new(vec![ + Series::new("id".into(), &["2", "3"]).into_column(), + Series::new("val".into(), &[20i64, 30]).into_column(), + ]).unwrap(); + + let opts = DiffOptions { key_columns: vec!["id".to_string()], ..Default::default() }; + let result = diff_keyed(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap(); + assert_eq!(result.removed.len(), 1); + assert_eq!(result.added.len(), 1); + 
assert_eq!(result.removed[0].values, vec!["1", "10"]); + assert_eq!(result.added[0].values, vec!["3", "30"]); + } + + #[test] + fn test_keyed_modified() { + let df_a = DataFrame::new(vec![ + Series::new("id".into(), &["1", "2"]).into_column(), + Series::new("score".into(), &[90i64, 80]).into_column(), + ]).unwrap(); + let df_b = DataFrame::new(vec![ + Series::new("id".into(), &["1", "2"]).into_column(), + Series::new("score".into(), &[95i64, 80]).into_column(), + ]).unwrap(); + + let opts = DiffOptions { key_columns: vec!["id".to_string()], ..Default::default() }; + let result = diff_keyed(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap(); + assert!(result.added.is_empty()); + assert!(result.removed.is_empty()); + assert_eq!(result.modified.len(), 1); + assert_eq!(result.modified[0].key, vec!["1"]); + assert_eq!(result.modified[0].changes.len(), 1); + assert_eq!(result.modified[0].changes[0].column, "score"); + assert_eq!(result.modified[0].changes[0].old_value, "90"); + assert_eq!(result.modified[0].changes[0].new_value, "95"); + } + + #[test] + fn test_keyed_composite_key() { + let df_a = DataFrame::new(vec![ + Series::new("date".into(), &["2024-01", "2024-01"]).into_column(), + Series::new("ticker".into(), &["AAPL", "GOOG"]).into_column(), + Series::new("price".into(), &[150i64, 100]).into_column(), + ]).unwrap(); + let df_b = DataFrame::new(vec![ + Series::new("date".into(), &["2024-01", "2024-01"]).into_column(), + Series::new("ticker".into(), &["AAPL", "GOOG"]).into_column(), + Series::new("price".into(), &[155i64, 100]).into_column(), + ]).unwrap(); + + let opts = DiffOptions { + key_columns: vec!["date".to_string(), "ticker".to_string()], + ..Default::default() + }; + let result = diff_keyed(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap(); + assert_eq!(result.modified.len(), 1); + assert_eq!(result.modified[0].key, vec!["2024-01", "AAPL"]); + } +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cargo test 
--lib diff::tests` +Expected: FAIL — `diff_keyed` not found + +- [ ] **Step 3: Implement key-based diff** + +Add to `src/diff.rs`: + +```rust +/// Row data stored during key-based comparison. +struct KeyedRow { + values: Vec<String>, + key_values: Vec<String>, +} + +/// Key-based diff: match rows by key columns, compare remaining cells. +pub fn diff_keyed( + df_a: &DataFrame, + df_b: &DataFrame, + opts: &DiffOptions, + source_a: SheetSource, + source_b: SheetSource, +) -> anyhow::Result<DiffResult> { + let headers_a: Vec<String> = df_a.get_column_names().iter().map(|s| s.to_string()).collect(); + let headers_b: Vec<String> = df_b.get_column_names().iter().map(|s| s.to_string()).collect(); + + // Resolve key column indices for both frames + let key_indices_a: Vec<usize> = opts.key_columns.iter() + .map(|k| headers_a.iter().position(|h| h == k) + .ok_or_else(|| anyhow::anyhow!("key column '{}' not found in first file", k))) + .collect::<anyhow::Result<Vec<_>>>()?; + + let key_indices_b: Vec<usize> = opts.key_columns.iter() + .map(|k| headers_b.iter().position(|h| h == k) + .ok_or_else(|| anyhow::anyhow!("key column '{}' not found in second file", k))) + .collect::<anyhow::Result<Vec<_>>>()?; + + // Find common non-key columns for comparison + let non_key_a: Vec<String> = headers_a.iter() + .filter(|h| !opts.key_columns.contains(h)) + .cloned().collect(); + let non_key_b: Vec<String> = headers_b.iter() + .filter(|h| !opts.key_columns.contains(h)) + .cloned().collect(); + let common_non_key: Vec<String> = non_key_a.iter() + .filter(|h| non_key_b.contains(h)) + .cloned().collect(); + + // Warn about columns only in one file + for col in &non_key_a { + if !non_key_b.contains(col) { + eprintln!("warning: column '{}' only in first file, skipping", col); + } + } + for col in &non_key_b { + if !non_key_a.contains(col) { + eprintln!("warning: column '{}' only in second file, skipping", col); + } + } + + // Headers = key columns + union of non-key columns (common first, then 
only-in-A, only-in-B) + let mut headers = opts.key_columns.clone(); + headers.extend(non_key_a.iter().chain( + non_key_b.iter().filter(|h| !non_key_a.contains(h)) + ).cloned()); + + // Build row maps + let map_a = build_key_map(df_a, &key_indices_a); + let map_b = build_key_map(df_b, &key_indices_b); + + // Detect duplicate keys + check_duplicate_keys(df_a, &key_indices_a, &source_a); + check_duplicate_keys(df_b, &key_indices_b, &source_b); + + let mut added = Vec::new(); + let mut removed = Vec::new(); + let mut modified = Vec::new(); + + // Removed: keys in A but not B + for (key_str, row_a) in &map_a { + if !map_b.contains_key(key_str) { + removed.push(DiffRow { values: row_a.values.clone() }); + } + } + + // Added: keys in B but not A; Modified: keys in both + for (key_str, row_b) in &map_b { + match map_a.get(key_str) { + None => { + added.push(DiffRow { values: row_b.values.clone() }); + } + Some(row_a) => { + let changes = compare_rows( + df_a, df_b, + &headers_a, &headers_b, + row_a, row_b, + &common_non_key, + opts, + ); + if !changes.is_empty() { + modified.push(ModifiedRow { + key: row_a.key_values.clone(), + changes, + }); + } + } + } + } + + Ok(DiffResult { + headers, + key_columns: opts.key_columns.clone(), + added, + removed, + modified, + source_a, + source_b, + }) +} + +fn build_key_map(df: &DataFrame, key_indices: &[usize]) -> HashMap<String, KeyedRow> { + let cols = df.get_columns(); + let mut map = HashMap::new(); + for i in 0..df.height() { + let key_values: Vec<String> = key_indices.iter() + .map(|&idx| formatter::format_any_value(&cols[idx].get(i).unwrap_or(AnyValue::Null))) + .collect(); + let key_str = key_values.join("\0"); + let values: Vec<String> = cols.iter() + .map(|c| formatter::format_any_value(&c.get(i).unwrap_or(AnyValue::Null))) + .collect(); + map.insert(key_str, KeyedRow { values, key_values }); + } + map +} + +fn check_duplicate_keys(df: &DataFrame, key_indices: &[usize], source: &SheetSource) { + let cols = df.get_columns(); 
+ let mut seen: HashMap<String, usize> = HashMap::new(); + for i in 0..df.height() { + let key: Vec<String> = key_indices.iter() + .map(|&idx| formatter::format_any_value(&cols[idx].get(i).unwrap_or(AnyValue::Null))) + .collect(); + let key_str = key.join(", "); + let count = seen.entry(key_str.clone()).or_insert(0); + *count += 1; + if *count == 2 { + eprintln!("warning: duplicate key \"{}\" in {}", key_str, source.file_name); + } + } +} + +/// Compare non-key columns of two matched rows. +fn compare_rows( + df_a: &DataFrame, + df_b: &DataFrame, + headers_a: &[String], + headers_b: &[String], + row_a: &KeyedRow, + row_b: &KeyedRow, + common_columns: &[String], + opts: &DiffOptions, +) -> Vec<CellChange> { + let mut changes = Vec::new(); + for col_name in common_columns { + let idx_a = headers_a.iter().position(|h| h == col_name).unwrap(); + let idx_b = headers_b.iter().position(|h| h == col_name).unwrap(); + let val_a = &row_a.values[idx_a]; + let val_b = &row_b.values[idx_b]; + + let is_equal = if let Some(tol) = opts.tolerance { + values_equal_with_tolerance(val_a, val_b, tol, df_a, df_b, col_name) + } else { + val_a == val_b + }; + + if !is_equal { + changes.push(CellChange { + column: col_name.clone(), + old_value: val_a.clone(), + new_value: val_b.clone(), + }); + } + } + changes +} + +fn is_float_dtype(dt: &DataType) -> bool { + matches!(dt, DataType::Float32 | DataType::Float64) +} + +fn is_int_dtype(dt: &DataType) -> bool { + matches!(dt, DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 + | DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64) +} + +/// Check if two values are equal, applying tolerance for float columns. +/// Tolerance applies to Float64 columns and Int64-vs-Float64 cross-type, +/// but NOT to pure integer columns (spec requirement). 
+fn values_equal_with_tolerance( + val_a: &str, + val_b: &str, + tolerance: f64, + df_a: &DataFrame, + df_b: &DataFrame, + col_name: &str, +) -> bool { + if let (Ok(a), Ok(b)) = (val_a.parse::<f64>(), val_b.parse::<f64>()) { + // Handle NaN + if a.is_nan() && b.is_nan() { + return true; + } + if a.is_nan() || b.is_nan() { + return false; + } + let type_a = df_a.column(col_name).ok().map(|c| c.dtype().clone()); + let type_b = df_b.column(col_name).ok().map(|c| c.dtype().clone()); + + let a_is_float = type_a.as_ref().map_or(false, is_float_dtype); + let b_is_float = type_b.as_ref().map_or(false, is_float_dtype); + let a_is_int = type_a.as_ref().map_or(false, is_int_dtype); + let b_is_int = type_b.as_ref().map_or(false, is_int_dtype); + + // Pure integer columns: exact comparison, no tolerance + if a_is_int && b_is_int { + return val_a == val_b; + } + // At least one float, or cross-type int/float: apply tolerance + if a_is_float || b_is_float { + return (a - b).abs() <= tolerance; + } + } + // Fallback: exact string comparison + val_a == val_b +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cargo test --lib diff::tests` +Expected: all 7 tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add src/diff.rs +git commit -m "feat(xldiff): implement key-based diff with composite keys and duplicate detection" +``` + +--- + +### Task 7: diff.rs — tolerance tests + +**Files:** +- Modify: `src/diff.rs` + +- [ ] **Step 1: Write tolerance tests** + +Add to the `tests` module: + +```rust + #[test] + fn test_keyed_tolerance_within() { + let df_a = DataFrame::new(vec![ + Series::new("id".into(), &["1"]).into_column(), + Series::new("price".into(), &[100.001f64]).into_column(), + ]).unwrap(); + let df_b = DataFrame::new(vec![ + Series::new("id".into(), &["1"]).into_column(), + Series::new("price".into(), &[100.002f64]).into_column(), + ]).unwrap(); + + let opts = DiffOptions { + key_columns: vec!["id".to_string()], + tolerance: Some(0.01), + }; + let result = 
diff_keyed(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap(); + assert!(!result.has_differences(), "values within tolerance should be equal"); + } + + #[test] + fn test_keyed_tolerance_exceeded() { + let df_a = DataFrame::new(vec![ + Series::new("id".into(), &["1"]).into_column(), + Series::new("price".into(), &[100.0f64]).into_column(), + ]).unwrap(); + let df_b = DataFrame::new(vec![ + Series::new("id".into(), &["1"]).into_column(), + Series::new("price".into(), &[100.05f64]).into_column(), + ]).unwrap(); + + let opts = DiffOptions { + key_columns: vec!["id".to_string()], + tolerance: Some(0.01), + }; + let result = diff_keyed(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap(); + assert_eq!(result.modified.len(), 1); + } + + #[test] + fn test_keyed_nan_handling() { + let df_a = DataFrame::new(vec![ + Series::new("id".into(), &["1"]).into_column(), + Series::new("val".into(), &[f64::NAN]).into_column(), + ]).unwrap(); + let df_b = DataFrame::new(vec![ + Series::new("id".into(), &["1"]).into_column(), + Series::new("val".into(), &[f64::NAN]).into_column(), + ]).unwrap(); + + let opts = DiffOptions { + key_columns: vec!["id".to_string()], + tolerance: Some(0.01), + }; + let result = diff_keyed(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap(); + assert!(!result.has_differences(), "NaN == NaN should be equal"); + } +``` + +- [ ] **Step 2: Run tests** + +Run: `cargo test --lib diff::tests` +Expected: all 10 tests PASS + +- [ ] **Step 3: Commit** + +```bash +git add src/diff.rs +git commit -m "test(xldiff): add tolerance and NaN handling tests" +``` + +--- + +### Task 8: diff.rs — public entry point `diff_sheets` + +**Files:** +- Modify: `src/diff.rs` + +- [ ] **Step 1: Write test for the unified entry point** + +Add to tests: + +```rust + #[test] + fn test_diff_sheets_positional() { + let df_a = DataFrame::new(vec![ + Series::new("x".into(), &["a", "b"]).into_column(), + ]).unwrap(); + let df_b = DataFrame::new(vec![ + 
Series::new("x".into(), &["a", "c"]).into_column(), + ]).unwrap(); + + let opts = DiffOptions::default(); // no key → positional + let result = diff_sheets(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap(); + assert_eq!(result.removed.len(), 1); + assert_eq!(result.added.len(), 1); + } + + #[test] + fn test_diff_sheets_keyed() { + let df_a = DataFrame::new(vec![ + Series::new("id".into(), &["1"]).into_column(), + Series::new("v".into(), &[10i64]).into_column(), + ]).unwrap(); + let df_b = DataFrame::new(vec![ + Series::new("id".into(), &["1"]).into_column(), + Series::new("v".into(), &[20i64]).into_column(), + ]).unwrap(); + + let opts = DiffOptions { key_columns: vec!["id".to_string()], ..Default::default() }; + let result = diff_sheets(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap(); + assert_eq!(result.modified.len(), 1); + } +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cargo test --lib diff::tests::test_diff_sheets` +Expected: FAIL + +- [ ] **Step 3: Implement `diff_sheets`** + +Add to `src/diff.rs`: + +```rust +/// Compare two DataFrames. Dispatches to positional or key-based mode. 
+pub fn diff_sheets( + df_a: &DataFrame, + df_b: &DataFrame, + opts: &DiffOptions, + source_a: SheetSource, + source_b: SheetSource, +) -> anyhow::Result<DiffResult> { + if opts.key_columns.is_empty() { + diff_positional(df_a, df_b, opts, source_a, source_b) + } else { + diff_keyed(df_a, df_b, opts, source_a, source_b) + } +} +``` + +- [ ] **Step 4: Run tests** + +Run: `cargo test --lib diff::tests` +Expected: all 12 tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add src/diff.rs +git commit -m "feat(xldiff): add diff_sheets entry point dispatching positional/keyed modes" +``` + +--- + +### Task 9: xldiff.rs — CLI skeleton + file:sheet parsing + +**Files:** +- Create: `src/bin/xldiff.rs` + +- [ ] **Step 1: Write the CLI binary with clap args and file:sheet parsing** + +Create `src/bin/xldiff.rs`: + +```rust +use std::io::IsTerminal; +use std::path::PathBuf; +use std::process; + +use anyhow::Result; +use clap::Parser; + +use xlcat::diff::{self, DiffOptions, DiffResult, SheetSource}; +use xlcat::filter; +use xlcat::metadata; +use xlcat::reader; + +#[derive(Parser)] +#[command( + name = "xldiff", + about = "Compare two Excel sheets and report differences", + version +)] +struct Args { + /// First file, optionally with :sheet suffix (e.g., old.xlsx:Sales) + file_a: String, + + /// Second file, optionally with :sheet suffix + file_b: String, + + /// Comma-separated key columns for row matching (name or column letter). + /// Omit for positional (whole-row) comparison. + #[arg(long)] + key: Option<String>, + + /// Comma-separated columns to compare (name or column letter). + /// Default: all columns. Key columns are always included. + #[arg(long)] + cols: Option<String>, + + /// Rows to skip before header. Single value for both files, or N,M for independent skips. + #[arg(long, default_value = "0")] + skip: String, + + /// Treat first row as data; generate synthetic column_0, column_1, … headers. 
+ /// NOTE: declared for CLI compatibility but not yet implemented — follow-up task. + #[arg(long)] + no_header: bool, + + /// Numeric tolerance for float comparisons (e.g., 0.01) + #[arg(long)] + tolerance: Option<f64>, + + /// Output format: text (default), markdown, json, csv + #[arg(long, default_value = "text")] + format: String, + + /// Force-disable ANSI colors + #[arg(long)] + no_color: bool, +} + +/// Parse a file argument like "path.xlsx:Sheet1" into (path, Option<sheet>). +/// Uses rfind(':') to handle Windows drive letters (C:\...). +fn parse_file_arg(arg: &str) -> (PathBuf, Option<String>) { + if let Some(colon_pos) = arg.rfind(':') { + // Don't split if the colon is part of a Windows drive letter (e.g., C:\) + // A drive letter colon is at position 1 and followed by \ or / + if colon_pos == 1 && arg.len() > 2 && (arg.as_bytes()[2] == b'\\' || arg.as_bytes()[2] == b'/') { + return (PathBuf::from(arg), None); + } + // Don't split if nothing after the colon + if colon_pos == arg.len() - 1 { + return (PathBuf::from(&arg[..colon_pos]), None); + } + let path = PathBuf::from(&arg[..colon_pos]); + let sheet = arg[colon_pos + 1..].to_string(); + (path, Some(sheet)) + } else { + (PathBuf::from(arg), None) + } +} + +/// Parse the --skip flag: "3" or "3,5" +fn parse_skip(s: &str) -> Result<(usize, usize)> { + if let Some((a, b)) = s.split_once(',') { + let skip_a = a.trim().parse::<usize>().map_err(|_| anyhow::anyhow!("invalid skip value: {}", a))?; + let skip_b = b.trim().parse::<usize>().map_err(|_| anyhow::anyhow!("invalid skip value: {}", b))?; + Ok((skip_a, skip_b)) + } else { + let skip = s.trim().parse::<usize>().map_err(|_| anyhow::anyhow!("invalid skip value: {}", s))?; + Ok((skip, skip)) + } +} + +/// Resolve sheet name from user input or default to first sheet. 
+fn resolve_sheet(info: &metadata::FileInfo, sheet_arg: Option<&str>) -> Result<String> { + match sheet_arg { + None => info.sheets.first() + .map(|s| s.name.clone()) + .ok_or_else(|| anyhow::anyhow!("workbook has no sheets")), + Some(s) => { + // Try exact name match + if let Some(sheet) = info.sheets.iter().find(|si| si.name == s) { + return Ok(sheet.name.clone()); + } + // Try as 0-based index + if let Ok(idx) = s.parse::<usize>() { + if let Some(sheet) = info.sheets.get(idx) { + return Ok(sheet.name.clone()); + } + } + let names: Vec<_> = info.sheets.iter().map(|s| s.name.as_str()).collect(); + anyhow::bail!("sheet '{}' not found. Available: {}", s, names.join(", ")) + } + } +} + +fn run(args: Args) -> Result<()> { + let (path_a, sheet_sel_a) = parse_file_arg(&args.file_a); + let (path_b, sheet_sel_b) = parse_file_arg(&args.file_b); + + // Validate files exist + if !path_a.exists() { + anyhow::bail!("file not found: {}", path_a.display()); + } + if !path_b.exists() { + anyhow::bail!("file not found: {}", path_b.display()); + } + + // Validate format + let format = match args.format.as_str() { + "text" | "markdown" | "json" | "csv" => args.format.as_str(), + other => anyhow::bail!("unknown format '{}'. Use: text, markdown, json, csv", other), + }; + + // Parse skip + let (skip_a, skip_b) = parse_skip(&args.skip)?; + + // Resolve sheets + let info_a = metadata::read_file_info(&path_a)?; + let info_b = metadata::read_file_info(&path_b)?; + let sheet_a = resolve_sheet(&info_a, sheet_sel_a.as_deref())?; + let sheet_b = resolve_sheet(&info_b, sheet_sel_b.as_deref())?; + + // Read DataFrames + let df_a = if skip_a > 0 { + reader::read_sheet_with_skip(&path_a, &sheet_a, skip_a)? + } else { + reader::read_sheet(&path_a, &sheet_a)? + }; + let df_b = if skip_b > 0 { + reader::read_sheet_with_skip(&path_b, &sheet_b, skip_b)? + } else { + reader::read_sheet(&path_b, &sheet_b)? 
+ }; + + // Resolve key columns (validate they exist in both files) + let key_columns: Vec<String> = if let Some(ref key_str) = args.key { + let specs: Vec<String> = key_str.split(',').map(|s| s.trim().to_string()).collect(); + let cols_a: Vec<String> = df_a.get_column_names().iter().map(|s| s.to_string()).collect(); + let cols_b: Vec<String> = df_b.get_column_names().iter().map(|s| s.to_string()).collect(); + let resolved = filter::resolve_columns(&specs, &cols_a) + .map_err(|e| anyhow::anyhow!("first file: {}", e))?; + // Validate key columns also exist in second file + for k in &resolved { + if !cols_b.iter().any(|c| c == k) { + anyhow::bail!("key column '{}' not found in second file", k); + } + } + resolved + } else { + vec![] + }; + + // Column filtering + let (df_a, df_b) = if let Some(ref cols_str) = args.cols { + let specs: Vec<String> = cols_str.split(',').map(|s| s.trim().to_string()).collect(); + let cols_a: Vec<String> = df_a.get_column_names().iter().map(|s| s.to_string()).collect(); + let mut selected = filter::resolve_columns(&specs, &cols_a) + .map_err(|e| anyhow::anyhow!(e))?; + // Ensure key columns are included + for k in &key_columns { + if !selected.contains(k) { + selected.insert(0, k.clone()); + } + } + let sel_refs: Vec<&str> = selected.iter().map(|s| s.as_str()).collect(); + let df_a = df_a.select(&sel_refs)?; + let df_b = df_b.select(&sel_refs)?; + (df_a, df_b) + } else { + (df_a, df_b) + }; + + let source_a = SheetSource { + file_name: path_a.file_name().unwrap_or_default().to_string_lossy().to_string(), + sheet_name: sheet_a, + }; + let source_b = SheetSource { + file_name: path_b.file_name().unwrap_or_default().to_string_lossy().to_string(), + sheet_name: sheet_b, + }; + + let opts = DiffOptions { + key_columns, + tolerance: args.tolerance, + }; + + let result = diff::diff_sheets(&df_a, &df_b, &opts, source_a, source_b)?; + + // Determine color + let use_color = !args.no_color && std::io::stdout().is_terminal(); + + // Format and print 
+ let output = match format { + "text" => format_text(&result, use_color), + "markdown" => format_markdown(&result), + "json" => format_json(&result), + "csv" => format_csv(&result), + _ => unreachable!(), + }; + print!("{output}"); + + // Exit code + if result.has_differences() { + process::exit(1); + } + Ok(()) +} + +fn main() { + let args = Args::parse(); + if let Err(err) = run(args) { + eprintln!("xldiff: {err}"); + process::exit(2); + } +} + +// Output formatters — implemented in subsequent tasks +fn format_text(_result: &DiffResult, _color: bool) -> String { todo!() } +fn format_markdown(_result: &DiffResult) -> String { todo!() } +fn format_json(_result: &DiffResult) -> String { todo!() } +fn format_csv(_result: &DiffResult) -> String { todo!() } +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cargo check --bin xldiff` +Expected: success (todo!() stubs compile but will panic at runtime) + +- [ ] **Step 3: Commit** + +```bash +git add src/bin/xldiff.rs +git commit -m "feat(xldiff): add CLI skeleton with file:sheet parsing, skip, key/cols resolution" +``` + +--- + +### Task 10: xldiff.rs — text output formatter + +**Files:** +- Modify: `src/bin/xldiff.rs` + +- [ ] **Step 1: Implement `format_text`** + +Replace the `format_text` stub: + +```rust +fn format_text(result: &DiffResult, color: bool) -> String { + if !result.has_differences() { + return "No differences found.\n".to_string(); + } + + let mut out = String::new(); + + // Header + let (red, green, yellow, reset) = if color { + ("\x1b[31m", "\x1b[32m", "\x1b[33m", "\x1b[0m") + } else { + ("", "", "", "") + }; + + out.push_str(&format!( + "--- {} ({})\n+++ {} ({})\n\n", + result.source_a.sheet_name, result.source_a.file_name, + result.source_b.sheet_name, result.source_b.file_name, + )); + + // Summary + out.push_str(&format!( + "Added: {} | Removed: {} | Modified: {}\n\n", + result.added.len(), + result.removed.len(), + result.modified.len(), + )); + + // Removed rows + for row in &result.removed { 
+ out.push_str(red); + out.push_str("- "); + out.push_str(&format_row_inline(&result.headers, &row.values)); + out.push_str(reset); + out.push('\n'); + } + + // Added rows + for row in &result.added { + out.push_str(green); + out.push_str("+ "); + out.push_str(&format_row_inline(&result.headers, &row.values)); + out.push_str(reset); + out.push('\n'); + } + + if (!result.removed.is_empty() || !result.added.is_empty()) && !result.modified.is_empty() { + out.push('\n'); + } + + // Modified rows + for m in &result.modified { + out.push_str(yellow); + out.push_str("~ "); + // Show key + let key_display: Vec<String> = result.key_columns.iter() + .zip(m.key.iter()) + .map(|(col, val)| format!("{}: \"{}\"", col, val)) + .collect(); + out.push_str(&key_display.join(" ")); + out.push_str(reset); + out.push('\n'); + for change in &m.changes { + out.push_str(yellow); + out.push_str(&format!(" {}: \"{}\" → \"{}\"", change.column, change.old_value, change.new_value)); + out.push_str(reset); + out.push('\n'); + } + } + + out +} + +/// Format a row as inline key-value pairs: Name: "Alice" Score: "90" +fn format_row_inline(headers: &[String], values: &[String]) -> String { + headers.iter() + .zip(values.iter()) + .map(|(h, v)| format!("{}: \"{}\"", h, v)) + .collect::<Vec<_>>() + .join(" ") +} +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cargo check --bin xldiff` +Expected: success + +- [ ] **Step 3: Commit** + +```bash +git add src/bin/xldiff.rs +git commit -m "feat(xldiff): implement colored text output formatter" +``` + +--- + +### Task 11: xldiff.rs — markdown output formatter + +**Files:** +- Modify: `src/bin/xldiff.rs` + +- [ ] **Step 1: Implement `format_markdown`** + +Replace the `format_markdown` stub: + +```rust +fn format_markdown(result: &DiffResult) -> String { + use xlcat::formatter; + + if !result.has_differences() { + return "No differences found.\n".to_string(); + } + + let mut out = String::new(); + + // Added + if !result.added.is_empty() { + 
out.push_str(&format!("## Added ({})\n", result.added.len())); + let rows: Vec<Vec<String>> = result.added.iter().map(|r| r.values.clone()).collect(); + out.push_str(&formatter::render_table(&result.headers, &rows)); + out.push('\n'); + } + + // Removed + if !result.removed.is_empty() { + out.push_str(&format!("## Removed ({})\n", result.removed.len())); + let rows: Vec<Vec<String>> = result.removed.iter().map(|r| r.values.clone()).collect(); + out.push_str(&formatter::render_table(&result.headers, &rows)); + out.push('\n'); + } + + // Modified + if !result.modified.is_empty() { + out.push_str(&format!("## Modified ({})\n", result.modified.len())); + let key_label = if result.key_columns.len() == 1 { + format!("Key ({})", result.key_columns[0]) + } else { + format!("Key ({})", result.key_columns.join(", ")) + }; + let mod_headers = vec![key_label, "Column".to_string(), "Old".to_string(), "New".to_string()]; + let mut mod_rows = Vec::new(); + for m in &result.modified { + let key_str = m.key.join(", "); + for (i, change) in m.changes.iter().enumerate() { + let key_cell = if i == 0 { key_str.clone() } else { String::new() }; + mod_rows.push(vec![ + key_cell, + change.column.clone(), + change.old_value.clone(), + change.new_value.clone(), + ]); + } + } + out.push_str(&formatter::render_table(&mod_headers, &mod_rows)); + out.push('\n'); + } + + out +} +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cargo check --bin xldiff` +Expected: success + +- [ ] **Step 3: Commit** + +```bash +git add src/bin/xldiff.rs +git commit -m "feat(xldiff): implement markdown output formatter" +``` + +--- + +### Task 12: xldiff.rs — JSON output formatter + +**Files:** +- Modify: `src/bin/xldiff.rs` + +- [ ] **Step 1: Implement `format_json`** + +Replace the `format_json` stub: + +```rust +fn format_json(result: &DiffResult) -> String { + use serde_json::{json, Map, Value}; + + let row_to_obj = |row: &diff::DiffRow| -> Value { + let mut map = Map::new(); + for (h, v) in 
result.headers.iter().zip(row.values.iter()) { + map.insert(h.clone(), Value::String(v.clone())); + } + Value::Object(map) + }; + + let added: Vec<Value> = result.added.iter().map(row_to_obj).collect(); + let removed: Vec<Value> = result.removed.iter().map(row_to_obj).collect(); + + let modified: Vec<Value> = result.modified.iter().map(|m| { + let mut key_map = Map::new(); + for (col, val) in result.key_columns.iter().zip(m.key.iter()) { + key_map.insert(col.clone(), Value::String(val.clone())); + } + let changes: Vec<Value> = m.changes.iter().map(|c| { + json!({ + "column": c.column, + "old": c.old_value, + "new": c.new_value, + }) + }).collect(); + json!({ + "key": Value::Object(key_map), + "changes": changes, + }) + }).collect(); + + let obj = json!({ + "added": added, + "removed": removed, + "modified": modified, + }); + + serde_json::to_string_pretty(&obj).unwrap() + "\n" +} +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cargo check --bin xldiff` +Expected: success + +- [ ] **Step 3: Commit** + +```bash +git add src/bin/xldiff.rs +git commit -m "feat(xldiff): implement JSON output formatter" +``` + +--- + +### Task 13: xldiff.rs — CSV output formatter + +**Files:** +- Modify: `src/bin/xldiff.rs` + +- [ ] **Step 1: Implement `format_csv`** + +Replace the `format_csv` stub: + +```rust +fn format_csv(result: &DiffResult) -> String { + let mut out = String::new(); + + // Header: _status, col1, col2, ..., _old_col1, _old_col2, ... 
+ let mut header_parts = vec!["_status".to_string()]; + header_parts.extend(result.headers.clone()); + for h in &result.headers { + header_parts.push(format!("_old_{}", h)); + } + out.push_str(&csv_row(&header_parts)); + + // Added rows + for row in &result.added { + let mut parts = vec!["added".to_string()]; + parts.extend(row.values.clone()); + // Empty _old_ columns + for _ in &result.headers { + parts.push(String::new()); + } + out.push_str(&csv_row(&parts)); + } + + // Removed rows + for row in &result.removed { + let mut parts = vec!["removed".to_string()]; + parts.extend(row.values.clone()); + // Empty _old_ columns + for _ in &result.headers { + parts.push(String::new()); + } + out.push_str(&csv_row(&parts)); + } + + // Modified rows + for m in &result.modified { + let mut main_vals = vec![String::new(); result.headers.len()]; + let mut old_vals = vec![String::new(); result.headers.len()]; + + // Fill key columns in main values + for (key_col, key_val) in result.key_columns.iter().zip(m.key.iter()) { + if let Some(idx) = result.headers.iter().position(|h| h == key_col) { + main_vals[idx] = key_val.clone(); + } + } + + // Fill changed columns + for change in &m.changes { + if let Some(idx) = result.headers.iter().position(|h| h == &change.column) { + main_vals[idx] = change.new_value.clone(); + old_vals[idx] = change.old_value.clone(); + } + } + + let mut parts = vec!["modified".to_string()]; + parts.extend(main_vals); + parts.extend(old_vals); + out.push_str(&csv_row(&parts)); + } + + out +} + +/// Format a single CSV row with RFC 4180 quoting. 
+fn csv_row(values: &[String]) -> String { + let escaped: Vec<String> = values.iter().map(|v| { + if v.contains(',') || v.contains('"') || v.contains('\n') { + format!("\"{}\"", v.replace('"', "\"\"")) + } else { + v.clone() + } + }).collect(); + escaped.join(",") + "\n" +} +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cargo check --bin xldiff` +Expected: success + +- [ ] **Step 3: Commit** + +```bash +git add src/bin/xldiff.rs +git commit -m "feat(xldiff): implement CSV output formatter" +``` + +--- + +### Task 14: Test fixtures + +**Files:** +- Modify: `tests/common/mod.rs` + +- [ ] **Step 1: Add diff test fixtures** + +Append to `tests/common/mod.rs`: + +```rust +/// Create a pair of files for positional diff testing. +/// File A: Alice/90, Bob/80, Charlie/70 +/// File B: Alice/90, Charlie/70, Dana/85 (Bob removed, Dana added) +pub fn create_diff_pair(path_a: &Path, path_b: &Path) { + let mut wb = Workbook::new(); + let ws = wb.add_worksheet().set_name("Data").unwrap(); + ws.write_string(0, 0, "Name").unwrap(); + ws.write_string(0, 1, "Score").unwrap(); + ws.write_string(1, 0, "Alice").unwrap(); + ws.write_number(1, 1, 90.0).unwrap(); + ws.write_string(2, 0, "Bob").unwrap(); + ws.write_number(2, 1, 80.0).unwrap(); + ws.write_string(3, 0, "Charlie").unwrap(); + ws.write_number(3, 1, 70.0).unwrap(); + wb.save(path_a).unwrap(); + + let mut wb = Workbook::new(); + let ws = wb.add_worksheet().set_name("Data").unwrap(); + ws.write_string(0, 0, "Name").unwrap(); + ws.write_string(0, 1, "Score").unwrap(); + ws.write_string(1, 0, "Alice").unwrap(); + ws.write_number(1, 1, 90.0).unwrap(); + ws.write_string(2, 0, "Charlie").unwrap(); + ws.write_number(2, 1, 70.0).unwrap(); + ws.write_string(3, 0, "Dana").unwrap(); + ws.write_number(3, 1, 85.0).unwrap(); + wb.save(path_b).unwrap(); +} + +/// Create a pair for key-based diff testing. 
+/// File A: ID=1/Score=90, ID=2/Score=80, ID=3/Score=70 +/// File B: ID=1/Score=95, ID=2/Score=80, ID=4/Score=85 +/// Expected: ID=1 modified (90→95), ID=3 removed, ID=4 added +pub fn create_diff_pair_with_keys(path_a: &Path, path_b: &Path) { + let mut wb = Workbook::new(); + let ws = wb.add_worksheet().set_name("Data").unwrap(); + ws.write_string(0, 0, "ID").unwrap(); + ws.write_string(0, 1, "Name").unwrap(); + ws.write_string(0, 2, "Score").unwrap(); + ws.write_string(1, 0, "1").unwrap(); + ws.write_string(1, 1, "Alice").unwrap(); + ws.write_number(1, 2, 90.0).unwrap(); + ws.write_string(2, 0, "2").unwrap(); + ws.write_string(2, 1, "Bob").unwrap(); + ws.write_number(2, 2, 80.0).unwrap(); + ws.write_string(3, 0, "3").unwrap(); + ws.write_string(3, 1, "Charlie").unwrap(); + ws.write_number(3, 2, 70.0).unwrap(); + wb.save(path_a).unwrap(); + + let mut wb = Workbook::new(); + let ws = wb.add_worksheet().set_name("Data").unwrap(); + ws.write_string(0, 0, "ID").unwrap(); + ws.write_string(0, 1, "Name").unwrap(); + ws.write_string(0, 2, "Score").unwrap(); + ws.write_string(1, 0, "1").unwrap(); + ws.write_string(1, 1, "Alice").unwrap(); + ws.write_number(1, 2, 95.0).unwrap(); + ws.write_string(2, 0, "2").unwrap(); + ws.write_string(2, 1, "Bob").unwrap(); + ws.write_number(2, 2, 80.0).unwrap(); + ws.write_string(3, 0, "4").unwrap(); + ws.write_string(3, 1, "Dana").unwrap(); + ws.write_number(3, 2, 85.0).unwrap(); + wb.save(path_b).unwrap(); +} + +/// Create a pair for tolerance testing. 
+/// File A: ID=1/Price=100.001, ID=2/Price=200.5 +/// File B: ID=1/Price=100.002, ID=2/Price=200.6 +/// With tolerance 0.01: only ID=2 should be modified (diff=0.1 > 0.01) +pub fn create_diff_pair_with_floats(path_a: &Path, path_b: &Path) { + let mut wb = Workbook::new(); + let ws = wb.add_worksheet().set_name("Data").unwrap(); + ws.write_string(0, 0, "ID").unwrap(); + ws.write_string(0, 1, "Price").unwrap(); + ws.write_string(1, 0, "1").unwrap(); + ws.write_number(1, 1, 100.001).unwrap(); + ws.write_string(2, 0, "2").unwrap(); + ws.write_number(2, 1, 200.5).unwrap(); + wb.save(path_a).unwrap(); + + let mut wb = Workbook::new(); + let ws = wb.add_worksheet().set_name("Data").unwrap(); + ws.write_string(0, 0, "ID").unwrap(); + ws.write_string(0, 1, "Price").unwrap(); + ws.write_string(1, 0, "1").unwrap(); + ws.write_number(1, 1, 100.002).unwrap(); + ws.write_string(2, 0, "2").unwrap(); + ws.write_number(2, 1, 200.6).unwrap(); + wb.save(path_b).unwrap(); +} +``` + +- [ ] **Step 2: Verify compilation** + +Run: `cargo test --no-run` +Expected: compiles successfully + +- [ ] **Step 3: Commit** + +```bash +git add tests/common/mod.rs +git commit -m "test(xldiff): add diff test fixtures — positional, keyed, and float tolerance pairs" +``` + +--- + +### Task 15: Integration tests + +**Files:** +- Create: `tests/test_xldiff.rs` + +- [ ] **Step 1: Write integration tests** + +Create `tests/test_xldiff.rs`: + +```rust +mod common; + +use assert_cmd::Command; +use predicates::prelude::*; +use tempfile::TempDir; + +#[test] +fn test_no_diff() { + let dir = TempDir::new().unwrap(); + let path_a = dir.path().join("a.xlsx"); + let path_b = dir.path().join("b.xlsx"); + // Use same data for both + common::create_simple(&path_a); + common::create_simple(&path_b); + + Command::cargo_bin("xldiff").unwrap() + .arg(path_a.to_str().unwrap()) + .arg(path_b.to_str().unwrap()) + .arg("--no-color") + .assert() + .success() // exit 0 + .stdout(predicate::str::contains("No differences 
found.")); +} + +#[test] +fn test_positional_diff() { + let dir = TempDir::new().unwrap(); + let path_a = dir.path().join("a.xlsx"); + let path_b = dir.path().join("b.xlsx"); + common::create_diff_pair(&path_a, &path_b); + + Command::cargo_bin("xldiff").unwrap() + .arg(path_a.to_str().unwrap()) + .arg(path_b.to_str().unwrap()) + .arg("--no-color") + .assert() + .code(1) // differences found + .stdout(predicate::str::contains("Removed: 1")) + .stdout(predicate::str::contains("Added: 1")) + .stdout(predicate::str::contains("Bob")); +} + +#[test] +fn test_keyed_diff() { + let dir = TempDir::new().unwrap(); + let path_a = dir.path().join("a.xlsx"); + let path_b = dir.path().join("b.xlsx"); + common::create_diff_pair_with_keys(&path_a, &path_b); + + Command::cargo_bin("xldiff").unwrap() + .arg(path_a.to_str().unwrap()) + .arg(path_b.to_str().unwrap()) + .arg("--key").arg("ID") + .arg("--no-color") + .assert() + .code(1) + .stdout(predicate::str::contains("Modified: 1")) + .stdout(predicate::str::contains("Removed: 1")) + .stdout(predicate::str::contains("Added: 1")); +} + +#[test] +fn test_tolerance() { + let dir = TempDir::new().unwrap(); + let path_a = dir.path().join("a.xlsx"); + let path_b = dir.path().join("b.xlsx"); + common::create_diff_pair_with_floats(&path_a, &path_b); + + // With tolerance=0.01, ID=1 diff (0.001) within tolerance, ID=2 diff (0.1) exceeds + Command::cargo_bin("xldiff").unwrap() + .arg(path_a.to_str().unwrap()) + .arg(path_b.to_str().unwrap()) + .arg("--key").arg("ID") + .arg("--tolerance").arg("0.01") + .arg("--no-color") + .assert() + .code(1) + .stdout(predicate::str::contains("Modified: 1")); +} + +#[test] +fn test_json_format() { + let dir = TempDir::new().unwrap(); + let path_a = dir.path().join("a.xlsx"); + let path_b = dir.path().join("b.xlsx"); + common::create_diff_pair_with_keys(&path_a, &path_b); + + Command::cargo_bin("xldiff").unwrap() + .arg(path_a.to_str().unwrap()) + .arg(path_b.to_str().unwrap()) + .arg("--key").arg("ID") + 
.arg("--format").arg("json") + .assert() + .code(1) + .stdout(predicate::str::contains("\"added\"")) + .stdout(predicate::str::contains("\"removed\"")) + .stdout(predicate::str::contains("\"modified\"")); +} + +#[test] +fn test_markdown_format() { + let dir = TempDir::new().unwrap(); + let path_a = dir.path().join("a.xlsx"); + let path_b = dir.path().join("b.xlsx"); + common::create_diff_pair_with_keys(&path_a, &path_b); + + Command::cargo_bin("xldiff").unwrap() + .arg(path_a.to_str().unwrap()) + .arg(path_b.to_str().unwrap()) + .arg("--key").arg("ID") + .arg("--format").arg("markdown") + .assert() + .code(1) + .stdout(predicate::str::contains("## Added")) + .stdout(predicate::str::contains("## Removed")) + .stdout(predicate::str::contains("## Modified")); +} + +#[test] +fn test_csv_format() { + let dir = TempDir::new().unwrap(); + let path_a = dir.path().join("a.xlsx"); + let path_b = dir.path().join("b.xlsx"); + common::create_diff_pair_with_keys(&path_a, &path_b); + + Command::cargo_bin("xldiff").unwrap() + .arg(path_a.to_str().unwrap()) + .arg(path_b.to_str().unwrap()) + .arg("--key").arg("ID") + .arg("--format").arg("csv") + .assert() + .code(1) + .stdout(predicate::str::contains("_status")) + .stdout(predicate::str::contains("added")) + .stdout(predicate::str::contains("removed")) + .stdout(predicate::str::contains("modified")); +} + +#[test] +fn test_file_not_found() { + Command::cargo_bin("xldiff").unwrap() + .arg("nonexistent.xlsx") + .arg("also_nonexistent.xlsx") + .assert() + .code(2) + .stderr(predicate::str::contains("file not found")); +} + +#[test] +fn test_sheet_selector() { + let dir = TempDir::new().unwrap(); + let path = dir.path().join("multi.xlsx"); + common::create_multi_sheet(&path); + let path_str = path.to_str().unwrap(); + + // Compare Revenue vs Expenses sheets within same file + Command::cargo_bin("xldiff").unwrap() + .arg(format!("{}:Revenue", path_str)) + .arg(format!("{}:Expenses", path_str)) + .arg("--no-color") + .assert() + 
.code(1); // different data → exit 1
+}
+
+#[test]
+fn test_cols_filter() {
+    let dir = TempDir::new().unwrap();
+    let path_a = dir.path().join("a.xlsx");
+    let path_b = dir.path().join("b.xlsx");
+    common::create_diff_pair_with_keys(&path_a, &path_b);
+
+    // Only compare Name column (ignore Score), so ID=1 shows no modification
+    Command::cargo_bin("xldiff").unwrap()
+        .arg(path_a.to_str().unwrap())
+        .arg(path_b.to_str().unwrap())
+        .arg("--key").arg("ID")
+        .arg("--cols").arg("ID,Name")
+        .arg("--no-color")
+        .assert()
+        .code(1)
+        // ID=1 has same Name in both, so no modification for that row
+        .stdout(predicate::str::contains("Modified: 0").or(predicate::str::contains("Modified").not()));
+}
+```
+
+- [ ] **Step 2: Run integration tests**
+
+Run: `cargo test --test test_xldiff`
+Expected: all tests PASS
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add tests/test_xldiff.rs
+git commit -m "test(xldiff): add integration tests — all modes, formats, tolerance, errors"
+```
+
+---
+
+### Task 16: Update CLAUDE.md
+
+**Files:**
+- Modify: `CLAUDE.md`
+
+- [ ] **Step 1: Update CLAUDE.md**
+
+Update the "Current tools" section to include xldiff, remove it from "Related tools" and "Planned":
+
+In CLAUDE.md, update:
+- Add `- **xldiff** — compare two sheets, report added/removed/modified rows` to "Current tools"
+- Remove or update the "Related tools" xldiff entry
+- Remove xlfilter from "Planned" if still there (it's already built)
+- Update the Architecture line to mention the diff module
+
+- [ ] **Step 2: Commit**
+
+```bash
+git add CLAUDE.md
+git commit -m "docs: update CLAUDE.md — add xldiff to current tools, remove from planned"
+```
+
+---
+
+### Task 17: Final verification
+
+- [ ] **Step 1: Run all tests**
+
+Run: `cargo test`
+Expected: all tests pass (existing + new)
+
+- [ ] **Step 2: Run clippy**
+
+Run: `cargo clippy --all-targets`
+Expected: no warnings
+
+- [ ] **Step 3: Build release binary**
+
+Run: `cargo build --release --bin xldiff`
+Expected: binary created at `target/release/xldiff` + +- [ ] **Step 4: Manual smoke test** + +Run: `cargo run --bin xldiff -- demo/budget.xlsx demo/sales.xlsx --no-color` +Expected: shows differences between the two demo files + +- [ ] **Step 5: Commit any fixes** + +If clippy or tests revealed issues, fix and commit. diff --git a/docs/superpowers/specs/2026-03-17-xldiff-design.md b/docs/superpowers/specs/2026-03-17-xldiff-design.md @@ -0,0 +1,248 @@ +# xldiff — Excel Sheet Comparison Tool + +Port of [go-xldiff](../../../go-xldiff) to Rust, leveraging polars for type-aware comparison. + +## CLI Interface + +``` +xldiff <FILE1>[:<SHEET>] <FILE2>[:<SHEET>] [flags] +``` + +Two positional file arguments, each optionally suffixed with `:SheetName` or `:0` (0-based index). The same file may appear twice with different sheets (e.g., `report.xlsx:Q1 report.xlsx:Q2`). + +Sheet selection uses `rfind(':')` to split on the last colon, so Windows drive-letter paths (e.g., `C:\file.xlsx`) are handled correctly. The suffix is tried as a sheet name first, then as a 0-based numeric index. + +### Flags + +| Flag | Type | Default | Description | +|------|------|---------|-------------| +| `--key <COLS>` | string | *(none)* | Comma-separated key columns (header name or column letter). Omit for positional mode. | +| `--cols <COLS>` | string | *(all)* | Columns to compare (header name or column letter). Key columns always included. | +| `--skip <N>[,<M>]` | string | `0` | Rows to skip before header. Single value applies to both files; comma pair sets each independently. | +| `--no-header` | bool | false | Treat first row as data; generate synthetic `column_0`, `column_1`, … headers (matching reader.rs convention). | +| `--tolerance <F>` | f64 | *(none)* | Numeric tolerance for float comparisons (e.g., `0.01`). | +| `--format <FMT>` | string | `text` | Output format: `text`, `markdown`, `json`, `csv`. 
| +| `--no-color` | bool | false | Force-disable ANSI colors (auto-detected by default). | + +**Column resolution priority** (consistent with `filter.rs::resolve_column()`): +1. Exact header name match +2. Case-insensitive header name match +3. Column letter (A, B, AA) if purely alphabetic + +This means `--key A` matches a header named "A" before column position 0. Consequently, a letter such as "A" selects a column by position only when no header is named "A"; otherwise the header-name match wins. + +### Exit Codes + +xldiff follows the `diff(1)` convention, which differs from xlcat/xlset/xlfilter: + +- **0** — no differences +- **1** — differences found +- **2** — error (invalid arguments or runtime failure) + +The other tools in this repo use exit 1 for runtime errors and exit 2 for argument errors. xldiff merges both error types into exit 2 because exit 1 is reserved for "differences found" — the standard convention for diff tools, enabling `if ! xldiff ...; then ...` in scripts. + +## Data Structures + +```rust +pub struct DiffResult { + pub headers: Vec<String>, // Columns that were compared (filtered set if --cols used) + pub key_columns: Vec<String>, + pub added: Vec<DiffRow>, + pub removed: Vec<DiffRow>, + pub modified: Vec<ModifiedRow>, + pub source_a: SheetSource, + pub source_b: SheetSource, +} + +pub struct SheetSource { + pub file_name: String, + pub sheet_name: String, +} + +pub struct DiffRow { + pub values: Vec<String>, // Cell values as display strings +} + +pub struct ModifiedRow { + pub key: Vec<String>, + pub changes: Vec<CellChange>, +} + +pub struct CellChange { + pub column: String, + pub old_value: String, + pub new_value: String, +} +``` + +`DiffResult` is format-agnostic. Each output formatter consumes it independently. `SheetSource` carries file/sheet metadata for headers. + +Values are converted to display strings during diff construction. 
Type-aware comparison (tolerance, numeric equality) happens during the diff phase; the result stores only the string representations. + +`headers` reflects the effective column set after `--cols` filtering, so output formatters use it directly without further filtering. + +## Diff Algorithm + +### Mismatched Headers + +Before diffing, the tool checks whether both DataFrames share the same column names (after `--cols` filtering). Behavior: + +- **Key-based mode:** key columns must exist in both files or the tool exits with an error. Non-key columns are compared by name — columns present in only one file are reported as added/removed columns in a warning on stderr, and comparison proceeds on the intersection. `DiffResult.headers` contains the union. +- **Positional mode:** columns are matched by position. If column counts differ, the tool warns on stderr and pads the shorter side with empty values. + +### Positional Mode (no `--key`) + +1. Convert each row to a string: join all compared columns with a null-byte separator. +2. Build frequency maps (`HashMap<RowHash, usize>`) for both tables. +3. Walk table A: if B has remaining copies, consume one; otherwise mark as removed. +4. Walk table B: if A has remaining copies, consume one; otherwise mark as added. +5. No "modified" rows — any cell change produces a different hash. + +### Key-Based Mode (`--key` provided) + +The implementation uses HashMap-based row matching (like the Go version) rather than polars joins. This avoids complications with join column type mismatches, row reordering, and duplicate-key cartesian products. Polars is used for reading and column selection, not for the join itself. + +1. Resolve key column names/letters to column indices. +2. Build `HashMap<Vec<String>, RowData>` for both tables, keyed by the concatenated key column values. +3. For each key in A not in B → removed. +4. For each key in B not in A → added. +5. 
For each key in both → compare non-key columns cell by cell: + - **Numeric (Float64):** if `--tolerance` is set, equal when `|a - b| <= tolerance`; otherwise exact. NaN == NaN is treated as equal; NaN vs non-NaN is a difference. + - **Integer vs Float cross-type:** if one file infers a column as Int64 and the other as Float64, both values are compared as f64. Tolerance applies if set. + - **Integer (Int64):** exact comparison (tolerance does not apply to pure integer columns). + - **String:** exact string comparison. + - **Datetime:** compare as timestamps. + - **Boolean:** exact match. + - **Null vs value:** treated as different. Null vs null: equal. +6. Rows with any cell difference become a `ModifiedRow` with a list of `CellChange`s. +7. Duplicate keys produce a warning on stderr. Last occurrence wins (same as Go). + +### Column Filtering (`--cols`) + +Applied before diffing. Both DataFrames are narrowed to the selected columns. Key columns are always retained, even when omitted from `--cols`. + +### Tolerance + +- Applies to Float64 columns and Int64-vs-Float64 cross-type comparisons in key-based mode. +- Positional mode hashes exact string values; tolerance does not apply. +- If `--tolerance` is set but no numeric columns exist, it is a silent no-op. + +## Output Formats + +### No Differences + +When the diff result is empty (no added, removed, or modified rows): + +- **Text:** prints "No differences found." to stdout. +- **Markdown:** prints "No differences found." +- **JSON:** `{"added":[],"removed":[],"modified":[]}` +- **CSV:** header row only (no data rows). + +All formats exit 0. + +### Text (default, colored) + +``` +--- Sheet1 (old.xlsx) ++++ Sheet1 (new.xlsx) + +Added: 1 | Removed: 1 | Modified: 1 + +- ID: "3" Name: "Charlie" Score: "70" ++ ID: "4" Name: "Dana" Score: "85" + +~ ID: "1" + Score: "90" → "95" +``` + +ANSI colors: red (`-`), green (`+`), yellow (`~`). Auto-detects TTY via `std::io::IsTerminal`; `--no-color` overrides. 
+ +### Markdown (`--format markdown`) + +```markdown +## Added (1) +| ID | Name | Score | +|----|------|-------| +| 4 | Dana | 85 | + +## Removed (1) +| ID | Name | Score | +|----|------|-------| +| 3 | Charlie | 70 | + +## Modified (1) +| Key (ID) | Column | Old | New | +|----------|--------|-----|-----| +| 1 | Score | 90 | 95 | +``` + +Clean, uncolored tables. The added/removed tables can reuse `formatter.rs::render_table_header()` and `render_table_rows()` for consistent alignment with xlcat/xlfilter output. The modified table is diff-specific. + +### JSON (`--format json`) + +```json +{ + "added": [{"ID": "4", "Name": "Dana", "Score": "85"}], + "removed": [{"ID": "3", "Name": "Charlie", "Score": "70"}], + "modified": [ + { + "key": {"ID": "1"}, + "changes": [{"column": "Score", "old": "90", "new": "95"}] + } + ] +} +``` + +Empty arrays are `[]`, never null. Pretty-printed with 2-space indentation. + +### CSV (`--format csv`) + +```csv +_status,ID,Name,Score,_old_ID,_old_Name,_old_Score +added,4,Dana,85,,, +removed,3,Charlie,70,,, +modified,1,,95,,,90 +``` + +`_status` column indicates row type. `_old_*` columns hold previous values. For modified rows: new values go in the main columns, old values in `_old_*` columns, only for changed cells. Unchanged cells are empty in both positions. Key column values appear in the main columns to identify the row. + +## Module Layout + +### New Files + +- **`src/diff.rs`** — Core diff logic: `DiffResult`, positional/key-based algorithms, tolerance, duplicate key detection. +- **`src/bin/xldiff.rs`** — CLI binary: clap argument parsing, sheet resolution, output formatting (text, markdown, json, csv). + +### Reused Modules + +- **`reader.rs`** — `read_sheet()` / `read_sheet_with_skip()` to load DataFrames. +- **`metadata.rs`** — `read_file_info()` for sheet resolution. +- **`cell.rs`** — Column letter resolution (shared with filter.rs via `col_letter_to_index()`). 
+- **`formatter.rs`** — `render_table()` for markdown added/removed tables. + +### Output Formatting + +The four format renderers live in the binary (`xldiff.rs`), not in `formatter.rs`. Diff output structure differs enough from xlcat's table rendering to warrant separation. The markdown formatter reuses `formatter.rs` table-rendering helpers for the added/removed sections. + +### Cargo.toml Changes + +- Add `[[bin]] name = "xldiff"` entry. +- Add `serde = { version = "1", features = ["derive"] }` and `serde_json = "1"` for JSON output. +- TTY detection via `std::io::IsTerminal` (stable since Rust 1.70, no extra crate needed). + +### Pipeline + +``` +parse args → resolve sheets → read DataFrames + → narrow to --cols → diff (positional or key-based) + → format output → print → exit code +``` + +## Testing + +- **Unit tests** in `diff.rs`: positional mode, key-based mode, tolerance (including cross-type Int64/Float64, NaN handling), duplicate keys, column filtering, edge cases (empty sheets, single-row, mismatched columns/headers). +- **Integration tests** in `tests/test_xldiff.rs`: CLI end-to-end with `assert_cmd`. +- **Fixtures**: add helpers to `tests/common/mod.rs`: + - `create_diff_pair()` — two files with added/removed/identical rows for positional mode. + - `create_diff_pair_with_keys()` — two files with key columns and modified cells. + - `create_diff_pair_with_floats()` — two files with float values differing by small amounts for tolerance testing. 
diff --git a/src/bin/xldiff.rs b/src/bin/xldiff.rs @@ -0,0 +1,826 @@ +use std::io::IsTerminal; +use std::path::PathBuf; +use std::process; + +use anyhow::{Result, bail}; +use clap::Parser; +use serde_json::{Map, Value, json}; + +use xlcat::diff::{DiffOptions, DiffResult, SheetSource}; +use xlcat::filter; +use xlcat::formatter; +use xlcat::metadata; +use xlcat::reader; + +#[derive(Parser)] +#[command( + name = "xldiff", + about = "Compare two Excel spreadsheets and show differences", + version +)] +struct Args { + /// First file (optionally file.xlsx:SheetName) + file_a: String, + + /// Second file (optionally file.xlsx:SheetName) + file_b: String, + + /// Key column(s) for matching rows (comma-separated names or letters) + #[arg(long)] + key: Option<String>, + + /// Columns to compare (comma-separated names or letters) + #[arg(long)] + cols: Option<String>, + + /// Rows to skip before header: single number or "skipA,skipB" + #[arg(long, default_value = "0")] + skip: String, + + /// Numeric tolerance for float comparisons + #[arg(long)] + tolerance: Option<f64>, + + /// Output format: text, markdown, json, csv + #[arg(long, default_value = "text")] + format: String, + + /// Disable colored output + #[arg(long)] + no_color: bool, +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/// Split a CLI argument like "file.xlsx:Sheet1" into (path, optional sheet). +/// +/// Handles Windows drive letters (e.g. C:\file.xlsx) by ignoring a colon +/// at position 1 when followed by `\` or `/`. A trailing colon with nothing +/// after it is treated as no sheet specification. +fn parse_file_arg(arg: &str) -> (PathBuf, Option<String>) { + // Find the *last* colon — that's the sheet separator. 
+ if let Some(pos) = arg.rfind(':') { + // Skip Windows drive letters: colon at position 1 followed by \ or / + if pos == 1 { + let after = arg.as_bytes().get(2); + if after == Some(&b'\\') || after == Some(&b'/') { + return (PathBuf::from(arg), None); + } + } + let sheet_part = &arg[pos + 1..]; + if sheet_part.is_empty() { + // Trailing colon — ignore it + return (PathBuf::from(&arg[..pos]), None); + } + (PathBuf::from(&arg[..pos]), Some(sheet_part.to_string())) + } else { + (PathBuf::from(arg), None) + } +} + +/// Parse the --skip flag: either "3" (same skip for both) or "3,5". +fn parse_skip(s: &str) -> Result<(usize, usize)> { + if let Some((a, b)) = s.split_once(',') { + let skip_a: usize = a + .trim() + .parse() + .map_err(|_| anyhow::anyhow!("invalid skip value: '{}'", a.trim()))?; + let skip_b: usize = b + .trim() + .parse() + .map_err(|_| anyhow::anyhow!("invalid skip value: '{}'", b.trim()))?; + Ok((skip_a, skip_b)) + } else { + let skip: usize = s + .trim() + .parse() + .map_err(|_| anyhow::anyhow!("invalid skip value: '{}'", s.trim()))?; + Ok((skip, skip)) + } +} + +/// Resolve a sheet name: exact match, then 0-based index, then error. +fn resolve_sheet( + info: &metadata::FileInfo, + sheet_arg: Option<&str>, +) -> Result<String> { + match sheet_arg { + None => info + .sheets + .first() + .map(|s| s.name.clone()) + .ok_or_else(|| anyhow::anyhow!("workbook has no sheets")), + Some(s) => { + // Exact name match + if let Some(sheet) = info.sheets.iter().find(|si| si.name == s) { + return Ok(sheet.name.clone()); + } + // 0-based index + if let Ok(idx) = s.parse::<usize>() + && let Some(sheet) = info.sheets.get(idx) + { + return Ok(sheet.name.clone()); + } + let names: Vec<_> = info.sheets.iter().map(|si| si.name.as_str()).collect(); + bail!( + "sheet '{}' not found. 
Available sheets: {}", + s, + names.join(", ") + ) + } + } +} + +// --------------------------------------------------------------------------- +// Output formatters +// --------------------------------------------------------------------------- + +/// Format a row's values inline: `Name: "Alice" Score: "90"` +fn format_row_inline(headers: &[String], values: &[String]) -> String { + headers + .iter() + .zip(values.iter()) + .map(|(h, v)| format!("{}: \"{}\"", h, v)) + .collect::<Vec<_>>() + .join(" ") +} + +/// Format diff result as colored (or plain) text output. +fn format_text(result: &DiffResult, color: bool) -> String { + if !result.has_differences() { + return "No differences found.\n".to_string(); + } + + let (red, green, yellow, reset) = if color { + ("\x1b[31m", "\x1b[32m", "\x1b[33m", "\x1b[0m") + } else { + ("", "", "", "") + }; + + let mut out = String::new(); + + // Header + out.push_str(&format!( + "--- {} ({})\n+++ {} ({})\n\n", + result.source_a.sheet_name, + result.source_a.file_name, + result.source_b.sheet_name, + result.source_b.file_name, + )); + + // Summary + out.push_str(&format!( + "Added: {} | Removed: {} | Modified: {}\n\n", + result.added.len(), + result.removed.len(), + result.modified.len(), + )); + + // Removed rows + for row in &result.removed { + out.push_str(&format!( + "{}- {}{}", + red, + format_row_inline(&result.headers, &row.values), + reset, + )); + out.push('\n'); + } + + // Added rows + for row in &result.added { + out.push_str(&format!( + "{}+ {}{}", + green, + format_row_inline(&result.headers, &row.values), + reset, + )); + out.push('\n'); + } + + // Modified rows + for m in &result.modified { + // Build key display: Key: "value" Key2: "value2" + let key_display: Vec<String> = result + .key_columns + .iter() + .zip(m.key.iter()) + .map(|(col, val)| format!("{}: \"{}\"", col, val)) + .collect(); + out.push_str(&format!( + "{}~ {}{}", + yellow, + key_display.join(" "), + reset, + )); + out.push('\n'); + for change in 
&m.changes { + out.push_str(&format!( + " {}: \"{}\" \u{2192} \"{}\"\n", + change.column, change.old_value, change.new_value, + )); + } + } + + out +} + +/// Format diff result as markdown. +fn format_markdown(result: &DiffResult) -> String { + if !result.has_differences() { + return "No differences found.\n".to_string(); + } + + let mut out = String::new(); + + // Added + if !result.added.is_empty() { + out.push_str(&format!("## Added ({})\n\n", result.added.len())); + let rows: Vec<Vec<String>> = result + .added + .iter() + .map(|r| r.values.clone()) + .collect(); + out.push_str(&formatter::render_table(&result.headers, &rows)); + out.push('\n'); + } + + // Removed + if !result.removed.is_empty() { + out.push_str(&format!("## Removed ({})\n\n", result.removed.len())); + let rows: Vec<Vec<String>> = result + .removed + .iter() + .map(|r| r.values.clone()) + .collect(); + out.push_str(&formatter::render_table(&result.headers, &rows)); + out.push('\n'); + } + + // Modified + if !result.modified.is_empty() { + out.push_str(&format!("## Modified ({})\n\n", result.modified.len())); + + let key_label = if result.key_columns.len() == 1 { + format!("Key ({})", result.key_columns[0]) + } else { + format!("Key ({})", result.key_columns.join(", ")) + }; + let headers = vec![ + key_label, + "Column".to_string(), + "Old".to_string(), + "New".to_string(), + ]; + let mut rows: Vec<Vec<String>> = Vec::new(); + + for m in &result.modified { + let key_display = m.key.join(", "); + + for (i, change) in m.changes.iter().enumerate() { + let key_cell = if i == 0 { + key_display.clone() + } else { + String::new() + }; + rows.push(vec![ + key_cell, + change.column.clone(), + change.old_value.clone(), + change.new_value.clone(), + ]); + } + } + + out.push_str(&formatter::render_table(&headers, &rows)); + out.push('\n'); + } + + out +} + +/// Format diff result as JSON. 
+fn format_json(result: &DiffResult) -> String { + let added: Vec<Value> = result + .added + .iter() + .map(|row| { + let mut map = Map::new(); + for (h, v) in result.headers.iter().zip(row.values.iter()) { + map.insert(h.clone(), Value::String(v.clone())); + } + Value::Object(map) + }) + .collect(); + + let removed: Vec<Value> = result + .removed + .iter() + .map(|row| { + let mut map = Map::new(); + for (h, v) in result.headers.iter().zip(row.values.iter()) { + map.insert(h.clone(), Value::String(v.clone())); + } + Value::Object(map) + }) + .collect(); + + let modified: Vec<Value> = result + .modified + .iter() + .map(|m| { + let mut key_map = Map::new(); + for (col, val) in result.key_columns.iter().zip(m.key.iter()) { + key_map.insert(col.clone(), Value::String(val.clone())); + } + let changes: Vec<Value> = m + .changes + .iter() + .map(|c| { + json!({ + "column": c.column, + "old": c.old_value, + "new": c.new_value, + }) + }) + .collect(); + json!({ + "key": Value::Object(key_map), + "changes": changes, + }) + }) + .collect(); + + let output = json!({ + "added": added, + "removed": removed, + "modified": modified, + }); + + serde_json::to_string_pretty(&output).unwrap() + "\n" +} + +/// Quote a value per RFC 4180: if it contains comma, quote, or newline, wrap +/// in double quotes and escape any internal quotes by doubling them. +fn csv_quote(value: &str) -> String { + if value.contains(',') || value.contains('"') || value.contains('\n') { + format!("\"{}\"", value.replace('"', "\"\"")) + } else { + value.to_string() + } +} + +/// Build a CSV row from a slice of values. +fn csv_row(values: &[String]) -> String { + values.iter().map(|v| csv_quote(v)).collect::<Vec<_>>().join(",") +} + +/// Format diff result as CSV. +/// +/// Header: _status, col1, col2, ..., _old_col1, _old_col2, ... 
+/// Added rows: "added" + values + empty _old_ columns +/// Removed rows: "removed" + values + empty _old_ columns +/// Modified rows: "modified" + new values (key cols + changed new values) + old values in _old_ columns +fn format_csv(result: &DiffResult) -> String { + let mut out = String::new(); + + // Build header + let mut header_parts: Vec<String> = vec!["_status".to_string()]; + for h in &result.headers { + header_parts.push(h.clone()); + } + for h in &result.headers { + header_parts.push(format!("_old_{}", h)); + } + out.push_str(&csv_row(&header_parts)); + out.push('\n'); + + let empty_cols: Vec<String> = result.headers.iter().map(|_| String::new()).collect(); + + // Removed rows + for row in &result.removed { + let mut parts: Vec<String> = vec!["removed".to_string()]; + parts.extend(row.values.iter().cloned()); + // Pad if row has fewer values than headers + while parts.len() < 1 + result.headers.len() { + parts.push(String::new()); + } + parts.extend(empty_cols.iter().cloned()); + out.push_str(&csv_row(&parts)); + out.push('\n'); + } + + // Added rows + for row in &result.added { + let mut parts: Vec<String> = vec!["added".to_string()]; + parts.extend(row.values.iter().cloned()); + while parts.len() < 1 + result.headers.len() { + parts.push(String::new()); + } + parts.extend(empty_cols.iter().cloned()); + out.push_str(&csv_row(&parts)); + out.push('\n'); + } + + // Modified rows + for m in &result.modified { + let mut main_cols: Vec<String> = Vec::new(); + let mut old_cols: Vec<String> = Vec::new(); + + for h in &result.headers { + // Check if this is a key column + if let Some(key_idx) = result.key_columns.iter().position(|k| k == h) { + main_cols.push(m.key.get(key_idx).cloned().unwrap_or_default()); + old_cols.push(String::new()); + } else if let Some(change) = m.changes.iter().find(|c| c.column == *h) { + main_cols.push(change.new_value.clone()); + old_cols.push(change.old_value.clone()); + } else { + // Unchanged non-key column — leave empty in 
both + main_cols.push(String::new()); + old_cols.push(String::new()); + } + } + + let mut parts: Vec<String> = vec!["modified".to_string()]; + parts.extend(main_cols); + parts.extend(old_cols); + out.push_str(&csv_row(&parts)); + out.push('\n'); + } + + out +} + +// --------------------------------------------------------------------------- +// run / main +// --------------------------------------------------------------------------- + +fn run(args: Args) -> Result<()> { + // Parse file arguments + let (path_a, sheet_arg_a) = parse_file_arg(&args.file_a); + let (path_b, sheet_arg_b) = parse_file_arg(&args.file_b); + + // Validate files exist + if !path_a.exists() { + bail!("file not found: {}", path_a.display()); + } + if !path_b.exists() { + bail!("file not found: {}", path_b.display()); + } + + // Validate format + let format = args.format.to_lowercase(); + if !["text", "markdown", "json", "csv"].contains(&format.as_str()) { + bail!( + "unknown format '{}'. Use: text, markdown, json, csv", + args.format + ); + } + + // Parse skip + let (skip_a, skip_b) = parse_skip(&args.skip)?; + + // Read file info and resolve sheets + let info_a = metadata::read_file_info(&path_a)?; + let info_b = metadata::read_file_info(&path_b)?; + + let sheet_a = resolve_sheet(&info_a, sheet_arg_a.as_deref())?; + let sheet_b = resolve_sheet(&info_b, sheet_arg_b.as_deref())?; + + // Read DataFrames + let df_a = if skip_a > 0 { + reader::read_sheet_with_skip(&path_a, &sheet_a, skip_a)? + } else { + reader::read_sheet(&path_a, &sheet_a)? + }; + let df_b = if skip_b > 0 { + reader::read_sheet_with_skip(&path_b, &sheet_b, skip_b)? + } else { + reader::read_sheet(&path_b, &sheet_b)? 
+ }; + + // Resolve key columns + let key_columns: Vec<String> = if let Some(ref key_str) = args.key { + let specs: Vec<String> = key_str.split(',').map(|s| s.trim().to_string()).collect(); + + let df_a_cols: Vec<String> = df_a + .get_column_names() + .iter() + .map(|s| s.to_string()) + .collect(); + let df_b_cols: Vec<String> = df_b + .get_column_names() + .iter() + .map(|s| s.to_string()) + .collect(); + + let resolved_a = filter::resolve_columns(&specs, &df_a_cols) + .map_err(|e| anyhow::anyhow!("first file: {}", e))?; + let resolved_b = filter::resolve_columns(&specs, &df_b_cols) + .map_err(|e| anyhow::anyhow!("second file: {}", e))?; + + // Validate keys match between both files + if resolved_a != resolved_b { + bail!( + "key columns resolve to different names: {:?} vs {:?}", + resolved_a, + resolved_b + ); + } + resolved_a + } else { + vec![] + }; + + // Column filtering with --cols + let (df_a, df_b) = if let Some(ref cols_str) = args.cols { + let specs: Vec<String> = cols_str.split(',').map(|s| s.trim().to_string()).collect(); + + let df_a_cols: Vec<String> = df_a + .get_column_names() + .iter() + .map(|s| s.to_string()) + .collect(); + let df_b_cols: Vec<String> = df_b + .get_column_names() + .iter() + .map(|s| s.to_string()) + .collect(); + + let mut selected_a = filter::resolve_columns(&specs, &df_a_cols) + .map_err(|e| anyhow::anyhow!(e))?; + let mut selected_b = filter::resolve_columns(&specs, &df_b_cols) + .map_err(|e| anyhow::anyhow!(e))?; + + // Ensure key columns are included + for key in &key_columns { + if !selected_a.contains(key) { + selected_a.insert(0, key.clone()); + } + if !selected_b.contains(key) { + selected_b.insert(0, key.clone()); + } + } + + let df_a = df_a.select(selected_a.iter().map(|s| s.as_str()))?; + let df_b = df_b.select(selected_b.iter().map(|s| s.as_str()))?; + (df_a, df_b) + } else { + (df_a, df_b) + }; + + // Build sources and options + let file_name_a = path_a + .file_name() + .map(|s| s.to_string_lossy().to_string()) + 
.unwrap_or_else(|| args.file_a.clone()); + let file_name_b = path_b + .file_name() + .map(|s| s.to_string_lossy().to_string()) + .unwrap_or_else(|| args.file_b.clone()); + + let source_a = SheetSource { + file_name: file_name_a, + sheet_name: sheet_a, + }; + let source_b = SheetSource { + file_name: file_name_b, + sheet_name: sheet_b, + }; + + let opts = DiffOptions { + key_columns, + tolerance: args.tolerance, + }; + + // Run diff + let result = xlcat::diff::diff_sheets(&df_a, &df_b, &opts, source_a, source_b)?; + + // TTY detection for color + let use_color = !args.no_color && std::io::stdout().is_terminal(); + + // Format output + let output = match format.as_str() { + "text" => format_text(&result, use_color), + "markdown" => format_markdown(&result), + "json" => format_json(&result), + "csv" => format_csv(&result), + _ => unreachable!(), + }; + + print!("{}", output); + + // Exit 1 if differences found (diff convention) + if result.has_differences() { + process::exit(1); + } + + Ok(()) +} + +fn main() { + let args = Args::parse(); + if let Err(err) = run(args) { + eprintln!("xldiff: {err}"); + process::exit(2); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use xlcat::diff::{CellChange, DiffRow, ModifiedRow}; + + // -- parse_file_arg -- + + #[test] + fn test_parse_file_arg_no_sheet() { + let (path, sheet) = parse_file_arg("data.xlsx"); + assert_eq!(path, PathBuf::from("data.xlsx")); + assert_eq!(sheet, None); + } + + #[test] + fn test_parse_file_arg_with_sheet() { + let (path, sheet) = parse_file_arg("data.xlsx:Revenue"); + assert_eq!(path, PathBuf::from("data.xlsx")); + assert_eq!(sheet, Some("Revenue".to_string())); + } + + #[test] + fn test_parse_file_arg_trailing_colon() { + let (path, sheet) = parse_file_arg("data.xlsx:"); + assert_eq!(path, PathBuf::from("data.xlsx")); + assert_eq!(sheet, None); + } + + #[test] + fn test_parse_file_arg_windows_drive() { + let (path, sheet) = parse_file_arg("C:\\Users\\file.xlsx"); + assert_eq!(path, 
PathBuf::from("C:\\Users\\file.xlsx")); + assert_eq!(sheet, None); + } + + #[test] + fn test_parse_file_arg_windows_drive_with_sheet() { + let (path, sheet) = parse_file_arg("C:\\Users\\file.xlsx:Sheet2"); + assert_eq!(path, PathBuf::from("C:\\Users\\file.xlsx")); + assert_eq!(sheet, Some("Sheet2".to_string())); + } + + // -- parse_skip -- + + #[test] + fn test_parse_skip_single() { + assert_eq!(parse_skip("3").unwrap(), (3, 3)); + } + + #[test] + fn test_parse_skip_pair() { + assert_eq!(parse_skip("3,5").unwrap(), (3, 5)); + } + + #[test] + fn test_parse_skip_zero() { + assert_eq!(parse_skip("0").unwrap(), (0, 0)); + } + + #[test] + fn test_parse_skip_invalid() { + assert!(parse_skip("abc").is_err()); + } + + // -- csv_quote -- + + #[test] + fn test_csv_quote_plain() { + assert_eq!(csv_quote("hello"), "hello"); + } + + #[test] + fn test_csv_quote_comma() { + assert_eq!(csv_quote("a,b"), "\"a,b\""); + } + + #[test] + fn test_csv_quote_quotes() { + assert_eq!(csv_quote("say \"hi\""), "\"say \"\"hi\"\"\""); + } + + // -- format_text -- + + #[test] + fn test_format_text_no_diff() { + let result = DiffResult { + headers: vec!["a".into()], + key_columns: vec![], + added: vec![], + removed: vec![], + modified: vec![], + source_a: SheetSource { + file_name: "a.xlsx".into(), + sheet_name: "Sheet1".into(), + }, + source_b: SheetSource { + file_name: "b.xlsx".into(), + sheet_name: "Sheet1".into(), + }, + }; + assert_eq!(format_text(&result, false), "No differences found.\n"); + } + + #[test] + fn test_format_text_with_changes() { + let result = DiffResult { + headers: vec!["id".into(), "name".into()], + key_columns: vec!["id".into()], + added: vec![DiffRow { + values: vec!["3".into(), "Charlie".into()], + }], + removed: vec![DiffRow { + values: vec!["1".into(), "Alice".into()], + }], + modified: vec![ModifiedRow { + key: vec!["2".into()], + changes: vec![CellChange { + column: "name".into(), + old_value: "Bob".into(), + new_value: "Robert".into(), + }], + }], + source_a: 
SheetSource { + file_name: "a.xlsx".into(), + sheet_name: "Sheet1".into(), + }, + source_b: SheetSource { + file_name: "b.xlsx".into(), + sheet_name: "Sheet1".into(), + }, + }; + let text = format_text(&result, false); + assert!(text.contains("--- Sheet1 (a.xlsx)")); + assert!(text.contains("+++ Sheet1 (b.xlsx)")); + assert!(text.contains("Added: 1")); + assert!(text.contains("Removed: 1")); + assert!(text.contains("Modified: 1")); + assert!(text.contains("- id: \"1\" name: \"Alice\"")); + assert!(text.contains("+ id: \"3\" name: \"Charlie\"")); + assert!(text.contains("~ id: \"2\"")); + assert!(text.contains("name: \"Bob\" \u{2192} \"Robert\"")); + } + + // -- format_json -- + + #[test] + fn test_format_json_structure() { + let result = DiffResult { + headers: vec!["id".into(), "val".into()], + key_columns: vec!["id".into()], + added: vec![DiffRow { + values: vec!["2".into(), "new".into()], + }], + removed: vec![], + modified: vec![], + source_a: SheetSource { + file_name: "a.xlsx".into(), + sheet_name: "S1".into(), + }, + source_b: SheetSource { + file_name: "b.xlsx".into(), + sheet_name: "S1".into(), + }, + }; + let json_str = format_json(&result); + let parsed: Value = serde_json::from_str(&json_str).unwrap(); + assert_eq!(parsed["added"][0]["id"], "2"); + assert_eq!(parsed["added"][0]["val"], "new"); + assert!(parsed["removed"].as_array().unwrap().is_empty()); + } + + // -- format_csv -- + + #[test] + fn test_format_csv_header() { + let result = DiffResult { + headers: vec!["id".into(), "name".into()], + key_columns: vec!["id".into()], + added: vec![], + removed: vec![], + modified: vec![], + source_a: SheetSource { + file_name: "a.xlsx".into(), + sheet_name: "S".into(), + }, + source_b: SheetSource { + file_name: "b.xlsx".into(), + sheet_name: "S".into(), + }, + }; + let csv = format_csv(&result); + let first_line = csv.lines().next().unwrap(); + assert_eq!(first_line, "_status,id,name,_old_id,_old_name"); + } +} diff --git a/src/bin/xlfilter.rs 
// src/bin/xlfilter.rs — CLI entry point: filter and query a sheet of an Excel
// workbook, printing the result as a markdown table or CSV.

use std::path::PathBuf;
use std::process;

use anyhow::Result;
use clap::Parser;

use xlcat::filter::{
    parse_filter_expr, parse_sort_spec, filter_pipeline, FilterOptions,
};
use xlcat::formatter;
use xlcat::metadata;
use xlcat::metadata::SheetInfo;
use xlcat::reader;

// NOTE: the `///` comments on Args fields are clap help text (they appear in
// `--help` output), so they are part of runtime behavior — do not reword them
// without intending to change the help screen.
#[derive(Parser)]
#[command(
    name = "xlfilter",
    about = "Filter and query Excel spreadsheet data",
    version
)]
struct Args {
    /// Path to .xls or .xlsx file
    file: PathBuf,

    /// Select columns by letter (A,B,D) or header name (State,Amount)
    #[arg(long)]
    cols: Option<String>,

    /// Filter rows. Multiple = AND. Operators: = != > < >= <= ~ (contains, case-insensitive) !~ (not contains)
    #[arg(long = "where")]
    filters: Vec<String>,

    /// Sort by column. Format: col:dir (dir = asc or desc, default asc)
    #[arg(long)]
    sort: Option<String>,

    /// Max rows in output (applied after filtering)
    #[arg(long)]
    limit: Option<usize>,

    /// First N rows (applied before filtering)
    #[arg(long)]
    head: Option<usize>,

    /// Last N rows (applied before filtering)
    #[arg(long)]
    tail: Option<usize>,

    /// Target sheet by name or 0-based index (default: first sheet)
    #[arg(long)]
    sheet: Option<String>,

    /// Output as CSV instead of markdown table
    #[arg(long)]
    csv: bool,

    /// Skip first N data rows (for metadata/title rows above the real header)
    #[arg(long)]
    skip: Option<usize>,
}

// ---------------------------------------------------------------------------
// ArgError — used for user-facing flag/argument errors (exit code 2)
// ---------------------------------------------------------------------------

#[derive(Debug)]
struct ArgError(String);

impl std::fmt::Display for ArgError {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "{}", self.0)
    }
}

// ArgError implements std::error::Error so it can travel inside anyhow::Error
// and be recovered via downcast_ref in main() to pick exit code 2.
impl std::error::Error for ArgError {}

/// Execute the full xlfilter pipeline for parsed CLI arguments.
///
/// Flow: validate flags → resolve sheet → read into a DataFrame →
/// parse filter/sort/column specs → run `filter_pipeline` → print.
/// Row counts go to stderr; table/CSV data goes to stdout, so the data
/// stream stays clean for piping.
///
/// Errors wrapped in `ArgError` signal user mistakes (exit code 2);
/// everything else is an operational failure (exit code 1).
fn run(args: Args) -> Result<()> {
    // Validate file exists
    if !args.file.exists() {
        anyhow::bail!(ArgError(format!(
            "file not found: {}",
            args.file.display()
        )));
    }

    // Validate mutually exclusive flags
    if args.head.is_some() && args.tail.is_some() {
        anyhow::bail!(ArgError(
            "--head and --tail are mutually exclusive".to_string()
        ));
    }

    // Read file metadata to resolve sheet name
    let info = metadata::read_file_info(&args.file)?;
    let sheet_name = resolve_sheet(&info, args.sheet.as_deref())?;

    // Read sheet into DataFrame; --skip drops title/metadata rows above the
    // real header before parsing.
    let df = if let Some(skip) = args.skip {
        reader::read_sheet_with_skip(&args.file, &sheet_name, skip)?
    } else {
        reader::read_sheet(&args.file, &sheet_name)?
    };

    if df.height() == 0 {
        eprintln!("0 rows");
        // Fall back to a synthetic SheetInfo if metadata didn't list the
        // sheet (shouldn't normally happen since the name was resolved above).
        let sheet_info = info
            .sheets
            .iter()
            .find(|s| s.name == sheet_name)
            .cloned()
            .unwrap_or(SheetInfo {
                name: sheet_name,
                rows: 0,
                cols: 0,
            });
        println!("{}", formatter::format_empty_sheet(&sheet_info));
        return Ok(());
    }

    // Parse filter expressions; the first malformed one aborts with ArgError.
    let filters: Vec<_> = args
        .filters
        .iter()
        .map(|s| parse_filter_expr(s))
        .collect::<Result<Vec<_>, _>>()
        .map_err(|e| anyhow::anyhow!(ArgError(e)))?;

    // Parse sort spec
    let sort = args
        .sort
        .as_deref()
        .map(parse_sort_spec)
        .transpose()
        .map_err(|e| anyhow::anyhow!(ArgError(e)))?;

    // Parse column selection (comma-separated letters or header names;
    // resolution against real headers happens inside filter_pipeline).
    let cols = args.cols.map(|s| {
        s.split(',')
            .map(|c| c.trim().to_string())
            .collect::<Vec<_>>()
    });

    // Run pipeline
    let opts = FilterOptions {
        filters,
        cols,
        sort,
        limit: args.limit,
        head: args.head,
        tail: args.tail,
    };
    let result = filter_pipeline(df, &opts)?;

    // Output row count to stderr
    eprintln!("{} rows", result.height());

    // Format output.
    // NOTE(review): when the filtered result is empty, the markdown table is
    // printed even if --csv was requested — confirm this is intended (an
    // empty CSV would be just the header row).
    if result.height() == 0 {
        println!("{}", formatter::format_data_table(&result));
    } else if args.csv {
        print!("{}", formatter::format_csv(&result));
    } else {
        println!("{}", formatter::format_data_table(&result));
    }

    Ok(())
}

/// Resolve sheet name from
--sheet flag or default to first sheet. +fn resolve_sheet(info: &metadata::FileInfo, sheet_arg: Option<&str>) -> Result<String> { + match sheet_arg { + None => { + info.sheets + .first() + .map(|s| s.name.clone()) + .ok_or_else(|| anyhow::anyhow!("workbook has no sheets")) + } + Some(s) => { + if let Some(sheet) = info.sheets.iter().find(|si| si.name == s) { + return Ok(sheet.name.clone()); + } + if let Ok(idx) = s.parse::<usize>() { + if let Some(sheet) = info.sheets.get(idx) { + return Ok(sheet.name.clone()); + } + } + let names: Vec<_> = info.sheets.iter().map(|s| s.name.as_str()).collect(); + anyhow::bail!(ArgError(format!( + "sheet '{}' not found. Available sheets: {}", + s, + names.join(", ") + ))) + } + } +} + +fn main() { + let args = Args::parse(); + if let Err(err) = run(args) { + if err.downcast_ref::<ArgError>().is_some() { + eprintln!("xlfilter: {err}"); + process::exit(2); + } + eprintln!("xlfilter: {err}"); + process::exit(1); + } +} diff --git a/src/diff.rs b/src/diff.rs @@ -0,0 +1,830 @@ +// Diff engine for comparing two Excel sheets. + +use anyhow::{Result, bail}; +use polars::prelude::*; +use std::collections::HashMap; + +use crate::formatter; + +/// Source file and sheet metadata for display. +#[derive(Debug, Clone)] +pub struct SheetSource { + pub file_name: String, + pub sheet_name: String, +} + +/// A single row from an added or removed set. +#[derive(Debug, Clone)] +pub struct DiffRow { + pub values: Vec<String>, +} + +/// A change in a single cell. +#[derive(Debug, Clone)] +pub struct CellChange { + pub column: String, + pub old_value: String, + pub new_value: String, +} + +/// A row present in both files with cell-level differences. +#[derive(Debug, Clone)] +pub struct ModifiedRow { + pub key: Vec<String>, + pub changes: Vec<CellChange>, +} + +/// Result of comparing two sheets. 
+#[derive(Debug, Clone)] +pub struct DiffResult { + pub headers: Vec<String>, + pub key_columns: Vec<String>, + pub added: Vec<DiffRow>, + pub removed: Vec<DiffRow>, + pub modified: Vec<ModifiedRow>, + pub source_a: SheetSource, + pub source_b: SheetSource, +} + +impl DiffResult { + pub fn has_differences(&self) -> bool { + !self.added.is_empty() || !self.removed.is_empty() || !self.modified.is_empty() + } +} + +/// Options controlling how the diff is performed. +#[derive(Debug, Clone, Default)] +pub struct DiffOptions { + pub key_columns: Vec<String>, + pub tolerance: Option<f64>, +} + +// --------------------------------------------------------------------------- +// Helper functions +// --------------------------------------------------------------------------- + +/// Format a cell value for display. Returns empty string for null. +fn cell_to_string(col: &Column, idx: usize) -> String { + match col.get(idx) { + Ok(AnyValue::Null) | Err(_) => String::new(), + Ok(v) => formatter::format_any_value(&v), + } +} + +/// Format a cell value for hashing. Uses a sentinel for null so that null +/// and empty string produce different keys. +fn cell_to_key_part(col: &Column, idx: usize) -> String { + match col.get(idx) { + Ok(AnyValue::Null) | Err(_) => "\x01NULL\x01".to_string(), + Ok(v) => formatter::format_any_value(&v), + } +} + +/// Build a string key for an entire row by joining all column values. +fn row_to_key(df: &DataFrame, row_idx: usize) -> String { + df.get_columns() + .iter() + .map(|col| cell_to_key_part(col, row_idx)) + .collect::<Vec<_>>() + .join("\0") +} + +/// Collect display values for every column in a row. 
+fn row_to_strings(df: &DataFrame, row_idx: usize) -> Vec<String> { + df.get_columns() + .iter() + .map(|col| cell_to_string(col, row_idx)) + .collect() +} + +// --------------------------------------------------------------------------- +// Positional diff +// --------------------------------------------------------------------------- + +/// Compare two DataFrames positionally (no key columns). +/// +/// Uses multiset comparison: each unique row is tracked by frequency. +/// Rows present in A but not (or fewer times) in B are "removed"; +/// rows present in B but not (or fewer times) in A are "added". +pub fn diff_positional( + df_a: &DataFrame, + df_b: &DataFrame, + _opts: &DiffOptions, + source_a: SheetSource, + source_b: SheetSource, +) -> Result<DiffResult> { + // Determine headers — use the longer header set. + let headers_a: Vec<String> = df_a.get_column_names().iter().map(|s| s.to_string()).collect(); + let headers_b: Vec<String> = df_b.get_column_names().iter().map(|s| s.to_string()).collect(); + + let headers = if headers_b.len() > headers_a.len() { + if headers_a.len() != headers_b.len() { + eprintln!( + "Warning: column count differs ({} vs {}), using wider header set", + headers_a.len(), + headers_b.len() + ); + } + headers_b.clone() + } else { + if headers_a.len() != headers_b.len() { + eprintln!( + "Warning: column count differs ({} vs {}), using wider header set", + headers_a.len(), + headers_b.len() + ); + } + headers_a.clone() + }; + + let num_headers = headers.len(); + + // Build frequency maps: key → list of row indices (so we can consume them). 
+ let mut freq_a: HashMap<String, Vec<usize>> = HashMap::new(); + for i in 0..df_a.height() { + let key = row_to_key(df_a, i); + freq_a.entry(key).or_default().push(i); + } + + let mut freq_b: HashMap<String, Vec<usize>> = HashMap::new(); + for i in 0..df_b.height() { + let key = row_to_key(df_b, i); + freq_b.entry(key).or_default().push(i); + } + + let mut removed = Vec::new(); + let mut added = Vec::new(); + + // Walk A: for each row, try to consume a matching row from B. + for i in 0..df_a.height() { + let key = row_to_key(df_a, i); + let consumed = freq_b + .get_mut(&key) + .and_then(|indices| indices.pop()) + .is_some(); + if !consumed { + let mut vals = row_to_strings(df_a, i); + vals.resize(num_headers, String::new()); + removed.push(DiffRow { values: vals }); + } + } + + // Walk B: for each row, try to consume a matching row from A. + for i in 0..df_b.height() { + let key = row_to_key(df_b, i); + let consumed = freq_a + .get_mut(&key) + .and_then(|indices| indices.pop()) + .is_some(); + if !consumed { + let mut vals = row_to_strings(df_b, i); + vals.resize(num_headers, String::new()); + added.push(DiffRow { values: vals }); + } + } + + Ok(DiffResult { + headers, + key_columns: vec![], + added, + removed, + modified: vec![], + source_a, + source_b, + }) +} + +// --------------------------------------------------------------------------- +// Key-based diff +// --------------------------------------------------------------------------- + +/// A row indexed by its key columns. +struct KeyedRow { + values: Vec<String>, + key_values: Vec<String>, +} + +/// Build a map from composite key string to KeyedRow for every row in the DataFrame. 
fn build_key_map(
    df: &DataFrame,
    key_indices: &[usize],
    columns: &[Column],
) -> HashMap<String, KeyedRow> {
    // NOTE(review): keys here are built with cell_to_string, so a null key
    // cell and an empty-string key cell produce the same composite key —
    // unlike the positional path, which uses cell_to_key_part's null
    // sentinel. Confirm this asymmetry is intended.
    // Duplicate keys: map.insert overwrites, so the LAST occurrence wins
    // (check_duplicate_keys warns about this separately).
    let mut map = HashMap::new();
    for i in 0..df.height() {
        let key_values: Vec<String> = key_indices
            .iter()
            .map(|&ki| cell_to_string(&columns[ki], i))
            .collect();
        let composite_key = key_values.join("\0");
        let values: Vec<String> = columns.iter().map(|col| cell_to_string(col, i)).collect();
        map.insert(
            composite_key,
            KeyedRow {
                values,
                key_values,
            },
        );
    }
    map
}

/// Warn on stderr when duplicate keys are found.
/// Warns exactly once per duplicated key (on its second occurrence).
fn check_duplicate_keys(
    df: &DataFrame,
    key_indices: &[usize],
    columns: &[Column],
    source: &SheetSource,
) {
    let mut seen: HashMap<String, usize> = HashMap::new();
    for i in 0..df.height() {
        let key: String = key_indices
            .iter()
            .map(|&ki| cell_to_string(&columns[ki], i))
            .collect::<Vec<_>>()
            .join("\0");
        let count = seen.entry(key.clone()).or_insert(0);
        *count += 1;
        if *count == 2 {
            // Render the NUL-joined composite key in readable form.
            let display_key = key.replace('\0', ", ");
            eprintln!(
                "Warning: duplicate key [{}] in {}:{}",
                display_key, source.file_name, source.sheet_name
            );
        }
    }
}

/// Check whether a polars DataType is a float type.
fn is_float_dtype(dt: &DataType) -> bool {
    matches!(dt, DataType::Float32 | DataType::Float64)
}

/// Check whether a polars DataType is an integer type.
fn is_int_dtype(dt: &DataType) -> bool {
    matches!(
        dt,
        DataType::Int8
            | DataType::Int16
            | DataType::Int32
            | DataType::Int64
            | DataType::UInt8
            | DataType::UInt16
            | DataType::UInt32
            | DataType::UInt64
    )
}

/// Compare two string-rendered values with optional numeric tolerance.
///
/// Rules:
/// - NaN == NaN is true.
/// - NaN vs non-NaN is false.
/// - Pure int+int columns use exact comparison (no tolerance applied).
/// - At least one float column applies tolerance.
/// - Otherwise exact string comparison.
fn values_equal_with_tolerance(
    val_a: &str,
    val_b: &str,
    tolerance: f64,
    df_a: &DataFrame,
    df_b: &DataFrame,
    col_name: &str,
) -> bool {
    let parsed_a = val_a.parse::<f64>();
    let parsed_b = val_b.parse::<f64>();

    match (parsed_a, parsed_b) {
        (Ok(a), Ok(b)) => {
            if a.is_nan() && b.is_nan() {
                return true;
            }
            if a.is_nan() || b.is_nan() {
                return false;
            }

            // Columns missing from a frame (e.g. only present in one file)
            // default to String dtype, which forces exact comparison below.
            // NOTE(review): dtype lookup happens per cell comparison; for
            // large sheets this could be hoisted to once per column.
            let dt_a = df_a
                .column(col_name)
                .map(|c| c.dtype().clone())
                .unwrap_or(DataType::String);
            let dt_b = df_b
                .column(col_name)
                .map(|c| c.dtype().clone())
                .unwrap_or(DataType::String);

            if is_int_dtype(&dt_a) && is_int_dtype(&dt_b) {
                // Exact: tolerance on integer data would hide real changes.
                val_a == val_b
            } else if is_float_dtype(&dt_a) || is_float_dtype(&dt_b) {
                (a - b).abs() <= tolerance
            } else {
                val_a == val_b
            }
        }
        // At least one side is not numeric → exact string comparison.
        _ => val_a == val_b,
    }
}

/// Compare non-key columns of two keyed rows and return cell-level changes.
/// Columns absent from a frame contribute an empty string on that side.
#[allow(clippy::too_many_arguments)]
fn compare_rows(
    df_a: &DataFrame,
    df_b: &DataFrame,
    headers_a: &[String],
    headers_b: &[String],
    row_a: &KeyedRow,
    row_b: &KeyedRow,
    common_columns: &[String],
    opts: &DiffOptions,
) -> Vec<CellChange> {
    let mut changes = Vec::new();
    for col_name in common_columns {
        let idx_a = headers_a.iter().position(|h| h == col_name);
        let idx_b = headers_b.iter().position(|h| h == col_name);
        let val_a = idx_a
            .map(|i| row_a.values.get(i).cloned().unwrap_or_default())
            .unwrap_or_default();
        let val_b = idx_b
            .map(|i| row_b.values.get(i).cloned().unwrap_or_default())
            .unwrap_or_default();

        let equal = if let Some(tol) = opts.tolerance {
            values_equal_with_tolerance(&val_a, &val_b, tol, df_a, df_b, col_name)
        } else {
            val_a == val_b
        };

        if !equal {
            changes.push(CellChange {
                column: col_name.clone(),
                old_value: val_a,
                new_value: val_b,
            });
        }
    }
    changes
}

/// Compare two DataFrames using key columns.
pub fn diff_keyed(
    df_a: &DataFrame,
    df_b: &DataFrame,
    opts: &DiffOptions,
    source_a: SheetSource,
    source_b: SheetSource,
) -> Result<DiffResult> {
    let columns_a = df_a.get_columns();
    let columns_b = df_b.get_columns();
    let headers_a: Vec<String> = df_a.get_column_names().iter().map(|s| s.to_string()).collect();
    let headers_b: Vec<String> = df_b.get_column_names().iter().map(|s| s.to_string()).collect();

    // Resolve key column indices in both frames; every key column must exist
    // in both files or the diff is aborted.
    let mut key_indices_a = Vec::new();
    let mut key_indices_b = Vec::new();
    for key_col in &opts.key_columns {
        match headers_a.iter().position(|h| h == key_col) {
            Some(idx) => key_indices_a.push(idx),
            None => bail!("Key column '{}' not found in {}", key_col, source_a.file_name),
        }
        match headers_b.iter().position(|h| h == key_col) {
            Some(idx) => key_indices_b.push(idx),
            None => bail!("Key column '{}' not found in {}", key_col, source_b.file_name),
        }
    }

    // Find non-key columns.
    let non_key_a: Vec<String> = headers_a
        .iter()
        .filter(|h| !opts.key_columns.contains(h))
        .cloned()
        .collect();
    let non_key_b: Vec<String> = headers_b
        .iter()
        .filter(|h| !opts.key_columns.contains(h))
        .cloned()
        .collect();

    // Common non-key columns (for modification detection).
    let common_columns: Vec<String> = non_key_a
        .iter()
        .filter(|h| non_key_b.contains(h))
        .cloned()
        .collect();

    // Warn about columns only in one file.
    for col in &non_key_a {
        if !non_key_b.contains(col) {
            eprintln!("Warning: column '{}' only in {}", col, source_a.file_name);
        }
    }
    for col in &non_key_b {
        if !non_key_a.contains(col) {
            eprintln!("Warning: column '{}' only in {}", col, source_b.file_name);
        }
    }

    // Build output headers: key columns + all from A non-key + B-only non-key.
    let mut headers = opts.key_columns.clone();
    headers.extend(non_key_a.iter().cloned());
    for col in &non_key_b {
        if !non_key_a.contains(col) {
            headers.push(col.clone());
        }
    }

    // Check for duplicate keys (warn only; build_key_map keeps the last).
    check_duplicate_keys(df_a, &key_indices_a, columns_a, &source_a);
    check_duplicate_keys(df_b, &key_indices_b, columns_b, &source_b);

    // Build key maps.
    let map_a = build_key_map(df_a, &key_indices_a, columns_a);
    let map_b = build_key_map(df_b, &key_indices_b, columns_b);

    let mut removed = Vec::new();
    let mut added = Vec::new();
    let mut modified = Vec::new();

    // NOTE(review): the loops below iterate HashMaps, so the order of
    // added/removed/modified is nondeterministic across runs. Consider
    // sorting by key (or preserving source row order) for stable output.

    // Keys in A but not in B → removed. Values are projected into the
    // output header order; headers absent from A get an empty cell.
    for (composite_key, row_a) in &map_a {
        if !map_b.contains_key(composite_key) {
            let mut vals = Vec::new();
            for h in &headers {
                if let Some(idx) = headers_a.iter().position(|ha| ha == h) {
                    vals.push(row_a.values.get(idx).cloned().unwrap_or_default());
                } else {
                    vals.push(String::new());
                }
            }
            removed.push(DiffRow { values: vals });
        }
    }

    // Keys in B but not in A → added.
    for (composite_key, row_b) in &map_b {
        if !map_a.contains_key(composite_key) {
            let mut vals = Vec::new();
            for h in &headers {
                if let Some(idx) = headers_b.iter().position(|hb| hb == h) {
                    vals.push(row_b.values.get(idx).cloned().unwrap_or_default());
                } else {
                    vals.push(String::new());
                }
            }
            added.push(DiffRow { values: vals });
        }
    }

    // Keys in both → compare for modifications.
    for (composite_key, row_a) in &map_a {
        if let Some(row_b) = map_b.get(composite_key) {
            let changes = compare_rows(
                df_a,
                df_b,
                &headers_a,
                &headers_b,
                row_a,
                row_b,
                &common_columns,
                opts,
            );
            if !changes.is_empty() {
                modified.push(ModifiedRow {
                    key: row_a.key_values.clone(),
                    changes,
                });
            }
        }
    }

    Ok(DiffResult {
        headers,
        key_columns: opts.key_columns.clone(),
        added,
        removed,
        modified,
        source_a,
        source_b,
    })
}

// ---------------------------------------------------------------------------
// Entry point
// ---------------------------------------------------------------------------

/// Compare two DataFrames, dispatching to positional or key-based diff
/// depending on whether key columns are specified.
pub fn diff_sheets(
    df_a: &DataFrame,
    df_b: &DataFrame,
    opts: &DiffOptions,
    source_a: SheetSource,
    source_b: SheetSource,
) -> Result<DiffResult> {
    if opts.key_columns.is_empty() {
        diff_positional(df_a, df_b, opts, source_a, source_b)
    } else {
        diff_keyed(df_a, df_b, opts, source_a, source_b)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Shared fixtures: two sources with the same sheet name.
    fn test_source_a() -> SheetSource {
        SheetSource {
            file_name: "a.xlsx".into(),
            sheet_name: "Sheet1".into(),
        }
    }

    fn test_source_b() -> SheetSource {
        SheetSource {
            file_name: "b.xlsx".into(),
            sheet_name: "Sheet1".into(),
        }
    }

    // ---- Positional diff tests ----

    #[test]
    fn test_positional_no_diff() {
        let df_a = df! {
            "name" => &["Alice", "Bob"],
            "score" => &[100, 200],
        }
        .unwrap();
        let df_b = df_a.clone();
        let opts = DiffOptions::default();

        let result = diff_positional(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert!(!result.has_differences());
        assert!(result.added.is_empty());
        assert!(result.removed.is_empty());
        assert!(result.modified.is_empty());
    }

    #[test]
    fn test_positional_added_removed() {
        let df_a = df!
        {
            "name" => &["Alice", "Bob"],
        }
        .unwrap();
        let df_b = df! {
            "name" => &["Alice", "Charlie"],
        }
        .unwrap();
        let opts = DiffOptions::default();

        let result = diff_positional(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert!(result.has_differences());
        assert_eq!(result.removed.len(), 1);
        assert_eq!(result.removed[0].values, vec!["Bob"]);
        assert_eq!(result.added.len(), 1);
        assert_eq!(result.added[0].values, vec!["Charlie"]);
    }

    // Multiset semantics: three "A" rows vs two → exactly one removal.
    #[test]
    fn test_positional_duplicate_rows() {
        let df_a = df! {
            "val" => &["A", "A", "A"],
        }
        .unwrap();
        let df_b = df! {
            "val" => &["A", "A"],
        }
        .unwrap();
        let opts = DiffOptions::default();

        let result = diff_positional(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert_eq!(result.removed.len(), 1);
        assert_eq!(result.removed[0].values, vec!["A"]);
        assert!(result.added.is_empty());
    }

    // ---- Key-based diff tests ----

    #[test]
    fn test_keyed_no_diff() {
        let df_a = df! {
            "id" => &[1, 2],
            "name" => &["Alice", "Bob"],
        }
        .unwrap();
        let df_b = df_a.clone();
        let opts = DiffOptions {
            key_columns: vec!["id".into()],
            tolerance: None,
        };

        let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert!(!result.has_differences());
    }

    #[test]
    fn test_keyed_added_removed() {
        let df_a = df! {
            "id" => &[1, 2],
            "name" => &["Alice", "Bob"],
        }
        .unwrap();
        let df_b = df!
        {
            "id" => &[2, 3],
            "name" => &["Bob", "Charlie"],
        }
        .unwrap();
        let opts = DiffOptions {
            key_columns: vec!["id".into()],
            tolerance: None,
        };

        let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert_eq!(result.removed.len(), 1);
        assert!(result.removed[0].values.contains(&"1".to_string()));
        assert!(result.removed[0].values.contains(&"Alice".to_string()));

        assert_eq!(result.added.len(), 1);
        assert!(result.added[0].values.contains(&"3".to_string()));
        assert!(result.added[0].values.contains(&"Charlie".to_string()));
    }

    #[test]
    fn test_keyed_modified() {
        let df_a = df! {
            "id" => &[1, 2],
            "score" => &[100, 200],
        }
        .unwrap();
        let df_b = df! {
            "id" => &[1, 2],
            "score" => &[100, 250],
        }
        .unwrap();
        let opts = DiffOptions {
            key_columns: vec!["id".into()],
            tolerance: None,
        };

        let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert!(result.added.is_empty());
        assert!(result.removed.is_empty());
        assert_eq!(result.modified.len(), 1);

        let m = &result.modified[0];
        assert_eq!(m.key, vec!["2"]);
        assert_eq!(m.changes.len(), 1);
        assert_eq!(m.changes[0].column, "score");
        assert_eq!(m.changes[0].old_value, "200");
        assert_eq!(m.changes[0].new_value, "250");
    }

    // Composite (multi-column) keys identify rows jointly.
    #[test]
    fn test_keyed_composite_key() {
        let df_a = df! {
            "date" => &["2024-01-01", "2024-01-01", "2024-01-02"],
            "ticker" => &["AAPL", "GOOG", "AAPL"],
            "price" => &[150.0, 140.0, 151.0],
        }
        .unwrap();
        let df_b = df!
        {
            "date" => &["2024-01-01", "2024-01-01", "2024-01-02"],
            "ticker" => &["AAPL", "GOOG", "AAPL"],
            "price" => &[150.0, 142.0, 151.0],
        }
        .unwrap();
        let opts = DiffOptions {
            key_columns: vec!["date".into(), "ticker".into()],
            tolerance: None,
        };

        let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert!(result.added.is_empty());
        assert!(result.removed.is_empty());
        assert_eq!(result.modified.len(), 1);

        let m = &result.modified[0];
        assert_eq!(m.key, vec!["2024-01-01", "GOOG"]);
        assert_eq!(m.changes[0].column, "price");
        assert_eq!(m.changes[0].old_value, "140");
        assert_eq!(m.changes[0].new_value, "142");
    }

    // ---- Tolerance tests ----

    #[test]
    fn test_keyed_tolerance_within() {
        let df_a = df! {
            "id" => &[1],
            "price" => &[100.001_f64],
        }
        .unwrap();
        let df_b = df! {
            "id" => &[1],
            "price" => &[100.002_f64],
        }
        .unwrap();
        let opts = DiffOptions {
            key_columns: vec!["id".into()],
            tolerance: Some(0.01),
        };

        let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert!(!result.has_differences());
    }

    #[test]
    fn test_keyed_tolerance_exceeded() {
        let df_a = df! {
            "id" => &[1],
            "price" => &[100.0_f64],
        }
        .unwrap();
        let df_b = df! {
            "id" => &[1],
            "price" => &[100.05_f64],
        }
        .unwrap();
        let opts = DiffOptions {
            key_columns: vec!["id".into()],
            tolerance: Some(0.01),
        };

        let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert_eq!(result.modified.len(), 1);
        assert_eq!(result.modified[0].changes[0].column, "price");
    }

    #[test]
    fn test_keyed_nan_handling() {
        let df_a = df! {
            "id" => &[1],
            "value" => &[f64::NAN],
        }
        .unwrap();
        let df_b = df!
        {
            "id" => &[1],
            "value" => &[f64::NAN],
        }
        .unwrap();
        let opts = DiffOptions {
            key_columns: vec!["id".into()],
            tolerance: Some(0.01),
        };

        let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert!(!result.has_differences(), "NaN vs NaN should be treated as equal");
    }

    // ---- diff_sheets entry point tests ----

    #[test]
    fn test_diff_sheets_positional() {
        let df_a = df! {
            "name" => &["Alice", "Bob"],
        }
        .unwrap();
        let df_b = df! {
            "name" => &["Alice", "Charlie"],
        }
        .unwrap();
        let opts = DiffOptions::default(); // No key columns → positional.

        let result = diff_sheets(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert!(result.key_columns.is_empty());
        assert_eq!(result.removed.len(), 1);
        assert_eq!(result.added.len(), 1);
    }

    #[test]
    fn test_diff_sheets_keyed() {
        let df_a = df! {
            "id" => &[1, 2],
            "score" => &[100, 200],
        }
        .unwrap();
        let df_b = df! {
            "id" => &[1, 2],
            "score" => &[100, 250],
        }
        .unwrap();
        let opts = DiffOptions {
            key_columns: vec!["id".into()],
            tolerance: None,
        };

        let result = diff_sheets(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert_eq!(result.key_columns, vec!["id"]);
        assert_eq!(result.modified.len(), 1);
    }
}

// ===========================================================================
// src/filter.rs — row filtering, sorting and column selection for xlfilter.
// ===========================================================================

use anyhow::Result;
use polars::prelude::*;

/// Comparison operator of a `--where` expression.
#[derive(Debug, Clone, PartialEq)]
pub enum FilterOp {
    Eq,
    NotEq,
    Gt,
    Lt,
    Gte,
    Lte,
    /// Substring match, case-insensitive.
    Contains,
    /// Negated substring match, case-insensitive.
    NotContains,
}

/// A parsed `--where` expression: `column <op> value`.
#[derive(Debug, Clone)]
pub struct FilterExpr {
    pub column: String,
    pub op: FilterOp,
    pub value: String,
}

/// A parsed `--sort` spec: column plus direction.
#[derive(Debug, Clone)]
pub struct SortSpec {
    pub column: String,
    pub descending: bool,
}

/// Parse a filter expression like "State=CA", "Amount>1000", "Name~john".
/// Scans left-to-right for the first operator character (= ! > < ~),
/// then determines the full operator.
pub fn parse_filter_expr(s: &str) -> Result<FilterExpr, String> {
    let op_chars = ['=', '!', '>', '<', '~'];
    // First operator character marks the column/operator boundary; column
    // names containing these characters are therefore not expressible.
    let pos = s
        .find(|c: char| op_chars.contains(&c))
        .ok_or_else(|| {
            format!(
                "no operator found in '{}'. Use =, !=, >, <, >=, <=, ~ or !~",
                s
            )
        })?;
    if pos == 0 {
        return Err(format!("missing column name in '{}'", s));
    }
    let column = s[..pos].to_string();
    let rest = &s[pos..];
    // Two-character operators must be checked before their one-character
    // prefixes (">=" before ">", "!=", "!~" before bare "!").
    let (op, op_len) = if rest.starts_with(">=") {
        (FilterOp::Gte, 2)
    } else if rest.starts_with("<=") {
        (FilterOp::Lte, 2)
    } else if rest.starts_with("!=") {
        (FilterOp::NotEq, 2)
    } else if rest.starts_with("!~") {
        (FilterOp::NotContains, 2)
    } else if rest.starts_with('>') {
        (FilterOp::Gt, 1)
    } else if rest.starts_with('<') {
        (FilterOp::Lt, 1)
    } else if rest.starts_with('=') {
        (FilterOp::Eq, 1)
    } else if rest.starts_with('~') {
        (FilterOp::Contains, 1)
    } else {
        // Reached only for a bare '!' not followed by '=' or '~'.
        return Err(format!("invalid operator in '{}'", s));
    };
    let value = rest[op_len..].to_string();
    Ok(FilterExpr { column, op, value })
}

/// Parse a sort spec like "Amount:desc" or "Name" (default asc).
/// Splits on the last colon so column names containing colons are supported
/// (as long as an explicit :asc/:desc suffix is given).
pub fn parse_sort_spec(s: &str) -> Result<SortSpec, String> {
    if let Some(colon_pos) = s.rfind(':') {
        let col = &s[..colon_pos];
        let dir = &s[colon_pos + 1..];
        match dir.to_lowercase().as_str() {
            "asc" => Ok(SortSpec {
                column: col.to_string(),
                descending: false,
            }),
            "desc" => Ok(SortSpec {
                column: col.to_string(),
                descending: true,
            }),
            _ => Err(format!(
                "invalid sort direction '{}'. Use 'asc' or 'desc'",
                dir
            )),
        }
    } else {
        // No colon → whole string is the column name, ascending.
        Ok(SortSpec {
            column: s.to_string(),
            descending: false,
        })
    }
}

/// Convert a column letter like "A", "B", "AA" to a 0-based index.
/// Returns None if the string isn't purely alphabetic or is empty.
+fn col_letter_to_index(s: &str) -> Option<usize> { + if s.is_empty() || !s.chars().all(|c| c.is_ascii_alphabetic()) { + return None; + } + let mut idx: usize = 0; + for c in s.to_uppercase().chars() { + idx = idx * 26 + (c as usize - 'A' as usize + 1); + } + Some(idx - 1) +} + +/// Resolve a column specifier to a DataFrame column name. +/// Accepts either: +/// - A header name (exact match first, then case-insensitive) +/// - A column letter like "A", "B", "AA" (mapped by position) +/// Header name match takes priority over column letter interpretation. +pub fn resolve_column(spec: &str, df_columns: &[String]) -> Result<String, String> { + // 1. Exact header name match + if df_columns.contains(&spec.to_string()) { + return Ok(spec.to_string()); + } + // 2. Case-insensitive header name match + let spec_lower = spec.to_lowercase(); + for col in df_columns { + if col.to_lowercase() == spec_lower { + return Ok(col.clone()); + } + } + // 3. Column letter (A=0, B=1, ...) — only if purely alphabetic + if let Some(idx) = col_letter_to_index(spec) { + if idx < df_columns.len() { + return Ok(df_columns[idx].clone()); + } + } + let available = df_columns.join(", "); + Err(format!("column '{}' not found. Available columns: {}", spec, available)) +} + +/// Resolve a list of column specifiers to DataFrame column names. +pub fn resolve_columns(specs: &[String], df_columns: &[String]) -> Result<Vec<String>, String> { + specs.iter().map(|s| resolve_column(s, df_columns)).collect() +} + +/// Check if a polars DataType is numeric. +fn is_numeric_dtype(dtype: &DataType) -> bool { + matches!( + dtype, + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Float32 + | DataType::Float64 + ) +} + +/// Build a boolean mask for a single filter expression against a DataFrame. 
fn build_filter_mask(df: &DataFrame, expr: &FilterExpr) -> Result<BooleanChunked> {
    let col = df.column(&expr.column).map_err(|e| anyhow::anyhow!("{}", e))?;
    let series = col.as_materialized_series();
    let dtype = series.dtype();

    match &expr.op {
        // =/!= : numeric comparison when the column is numeric AND the value
        // parses as a number; otherwise fall back to string comparison.
        FilterOp::Eq => {
            if is_numeric_dtype(dtype) {
                if let Ok(n) = expr.value.parse::<f64>() {
                    let s = series.cast(&DataType::Float64)?;
                    return Ok(s.f64()?.equal(n));
                }
            }
            let s = series.cast(&DataType::String)?;
            Ok(s.str()?.equal(expr.value.as_str()))
        }
        FilterOp::NotEq => {
            if is_numeric_dtype(dtype) {
                if let Ok(n) = expr.value.parse::<f64>() {
                    let s = series.cast(&DataType::Float64)?;
                    return Ok(s.f64()?.not_equal(n));
                }
            }
            let s = series.cast(&DataType::String)?;
            Ok(s.str()?.not_equal(expr.value.as_str()))
        }
        // Ordering operators require a numeric RHS; the column is cast to
        // f64 (non-numeric cells become null and compare false).
        FilterOp::Gt => {
            let n = parse_numeric_value(&expr.value, ">")?;
            let s = series.cast(&DataType::Float64)?;
            Ok(s.f64()?.gt(n))
        }
        FilterOp::Lt => {
            let n = parse_numeric_value(&expr.value, "<")?;
            let s = series.cast(&DataType::Float64)?;
            Ok(s.f64()?.lt(n))
        }
        FilterOp::Gte => {
            let n = parse_numeric_value(&expr.value, ">=")?;
            let s = series.cast(&DataType::Float64)?;
            Ok(s.f64()?.gt_eq(n))
        }
        FilterOp::Lte => {
            let n = parse_numeric_value(&expr.value, "<=")?;
            let s = series.cast(&DataType::Float64)?;
            Ok(s.f64()?.lt_eq(n))
        }
        // Case-insensitive substring match. Null cells: `~` excludes them
        // (false), `!~` includes them (true) — consistent negation.
        FilterOp::Contains => {
            let s = series.cast(&DataType::String)?;
            let ca = s.str()?;
            let pat = expr.value.to_lowercase();
            let mask: BooleanChunked = ca.into_iter()
                .map(|opt_s| opt_s.map(|s| s.to_lowercase().contains(&pat)).unwrap_or(false))
                .collect();
            Ok(mask)
        }
        FilterOp::NotContains => {
            let s = series.cast(&DataType::String)?;
            let ca = s.str()?;
            let pat = expr.value.to_lowercase();
            let mask: BooleanChunked = ca.into_iter()
                .map(|opt_s| opt_s.map(|s| !s.to_lowercase().contains(&pat)).unwrap_or(true))
                .collect();
            Ok(mask)
        }
    }
}

/// Parse the RHS of an ordering operator; `op` is only used in the error
/// message shown to the user.
fn parse_numeric_value(value: &str, op: &str) -> Result<f64> {
    value
        .parse::<f64>()
        .map_err(|_| anyhow::anyhow!("'{}' requires numeric value, got '{}'", op, value))
}

/// Apply a list of filter expressions to a DataFrame (AND logic).
/// An empty list returns the DataFrame unchanged.
pub fn apply_filters(df: &DataFrame, exprs: &[FilterExpr]) -> Result<DataFrame> {
    // Applied sequentially: each filter narrows the previous result.
    let mut result = df.clone();
    for expr in exprs {
        let mask = build_filter_mask(&result, expr)?;
        result = result.filter(&mask)?;
    }
    Ok(result)
}

/// Options for the filter pipeline.
pub struct FilterOptions {
    pub filters: Vec<FilterExpr>,
    /// Column specifiers (letters or header names); None keeps all columns.
    pub cols: Option<Vec<String>>,
    pub sort: Option<SortSpec>,
    /// Row cap applied after filtering and sorting.
    pub limit: Option<usize>,
    /// Pre-filter window: first N rows (mutually exclusive with `tail`).
    pub head: Option<usize>,
    /// Pre-filter window: last N rows (mutually exclusive with `head`).
    pub tail: Option<usize>,
}

/// Apply a sort specification to a DataFrame.
pub fn apply_sort(df: &DataFrame, spec: &SortSpec) -> Result<DataFrame> {
    let opts = SortMultipleOptions::default()
        .with_order_descending(spec.descending);
    Ok(df.sort([&spec.column], opts)?)
}

/// Run the full filter pipeline: head/tail → resolve & filter → sort → limit → select columns.
pub fn filter_pipeline(df: DataFrame, opts: &FilterOptions) -> Result<DataFrame> {
    // Column names are captured before any stage runs; head/tail/filter only
    // drop rows, so the set stays valid for all later resolution steps.
    let df_columns: Vec<String> = df
        .get_column_names()
        .iter()
        .map(|s| s.to_string())
        .collect();

    // 1. Pre-filter window: head or tail
    let df = if let Some(n) = opts.head {
        df.head(Some(n))
    } else if let Some(n) = opts.tail {
        df.tail(Some(n))
    } else {
        df
    };

    // 2. Resolve column names in filter expressions and apply filters
    let resolved_filters: Vec<FilterExpr> = opts
        .filters
        .iter()
        .map(|f| {
            let resolved_col = resolve_column(&f.column, &df_columns)?;
            Ok(FilterExpr {
                column: resolved_col,
                op: f.op.clone(),
                value: f.value.clone(),
            })
        })
        .collect::<Result<Vec<_>, String>>()
        .map_err(|e| anyhow::anyhow!("{}", e))?;

    let df = apply_filters(&df, &resolved_filters)?;

    // 3. Sort
    let df = if let Some(ref spec) = opts.sort {
        let resolved_col = resolve_column(&spec.column, &df_columns)
            .map_err(|e| anyhow::anyhow!("{}", e))?;
        let resolved_spec = SortSpec {
            column: resolved_col,
            descending: spec.descending,
        };
        apply_sort(&df, &resolved_spec)?
    } else {
        df
    };

    // 4. Limit (after filtering and sorting)
    let df = if let Some(n) = opts.limit {
        df.head(Some(n))
    } else {
        df
    };

    // 5. Select columns (last, so filters/sort may reference dropped columns)
    let df = if let Some(ref col_specs) = opts.cols {
        let resolved_cols = resolve_columns(col_specs, &df_columns)
            .map_err(|e| anyhow::anyhow!("{}", e))?;
        let col_refs: Vec<&str> = resolved_cols.iter().map(|s| s.as_str()).collect();
        df.select(col_refs)?
    } else {
        df
    };

    Ok(df)
}

#[cfg(test)]
mod tests {
    use super::*;

    // 5-row fixture exercising strings and i64 columns.
    fn make_test_df() -> DataFrame {
        DataFrame::new(vec![
            Column::new("State".into(), &["CA", "NY", "CA", "TX", "NY"]),
            Column::new("City".into(), &["LA", "NYC", "SF", "Houston", "Albany"]),
            Column::new("Amount".into(), &[1500i64, 2000, 800, 1200, 500]),
            Column::new("Year".into(), &[2023i64, 2023, 2024, 2024, 2023]),
            Column::new("Status".into(), &["Active", "Active", "Draft", "Active", "Draft"]),
        ])
        .unwrap()
    }

    #[test]
    fn filter_eq_string() {
        let df = make_test_df();
        let expr = parse_filter_expr("State=CA").unwrap();
        let result = apply_filters(&df, &[expr]).unwrap();
        assert_eq!(result.height(), 2);
    }

    #[test]
    fn filter_eq_numeric() {
        let df = make_test_df();
        let expr = parse_filter_expr("Amount=1500").unwrap();
        let result = apply_filters(&df, &[expr]).unwrap();
        assert_eq!(result.height(), 1);
    }

    #[test]
    fn filter_not_eq() {
        let df = make_test_df();
        let expr = parse_filter_expr("Status!=Draft").unwrap();
        let result = apply_filters(&df, &[expr]).unwrap();
        assert_eq!(result.height(), 3);
    }

    #[test]
    fn filter_gt() {
        let df = make_test_df();
        let expr = parse_filter_expr("Amount>1000").unwrap();
        let result =
apply_filters(&df, &[expr]).unwrap(); + assert_eq!(result.height(), 3); + } + + #[test] + fn filter_lt() { + let df = make_test_df(); + let expr = parse_filter_expr("Amount<1000").unwrap(); + let result = apply_filters(&df, &[expr]).unwrap(); + assert_eq!(result.height(), 2); + } + + #[test] + fn filter_gte() { + let df = make_test_df(); + let expr = parse_filter_expr("Amount>=1500").unwrap(); + let result = apply_filters(&df, &[expr]).unwrap(); + assert_eq!(result.height(), 2); + } + + #[test] + fn filter_lte() { + let df = make_test_df(); + let expr = parse_filter_expr("Amount<=800").unwrap(); + let result = apply_filters(&df, &[expr]).unwrap(); + assert_eq!(result.height(), 2); + } + + #[test] + fn filter_contains() { + let df = make_test_df(); + let expr = parse_filter_expr("City~ou").unwrap(); + let result = apply_filters(&df, &[expr]).unwrap(); + assert_eq!(result.height(), 1); + } + + #[test] + fn filter_contains_case_insensitive() { + let df = make_test_df(); + let expr = parse_filter_expr("City~HOUSTON").unwrap(); + let result = apply_filters(&df, &[expr]).unwrap(); + assert_eq!(result.height(), 1); + } + + #[test] + fn filter_not_contains() { + let df = make_test_df(); + let expr = parse_filter_expr("Status!~raft").unwrap(); + let result = apply_filters(&df, &[expr]).unwrap(); + assert_eq!(result.height(), 3); + } + + #[test] + fn filter_multiple_and() { + let df = make_test_df(); + let e1 = parse_filter_expr("State=CA").unwrap(); + let e2 = parse_filter_expr("Amount>1000").unwrap(); + let result = apply_filters(&df, &[e1, e2]).unwrap(); + assert_eq!(result.height(), 1); + } + + #[test] + fn filter_no_matches_returns_empty() { + let df = make_test_df(); + let expr = parse_filter_expr("State=ZZ").unwrap(); + let result = apply_filters(&df, &[expr]).unwrap(); + assert_eq!(result.height(), 0); + } + + #[test] + fn filter_empty_exprs_returns_all() { + let df = make_test_df(); + let result = apply_filters(&df, &[]).unwrap(); + assert_eq!(result.height(), 5); + 
} + + #[test] + fn parse_eq() { + let expr = parse_filter_expr("State=CA").unwrap(); + assert_eq!(expr.column, "State"); + assert_eq!(expr.op, FilterOp::Eq); + assert_eq!(expr.value, "CA"); + } + + #[test] + fn parse_not_eq() { + let expr = parse_filter_expr("Status!=Draft").unwrap(); + assert_eq!(expr.column, "Status"); + assert_eq!(expr.op, FilterOp::NotEq); + assert_eq!(expr.value, "Draft"); + } + + #[test] + fn parse_gt() { + let expr = parse_filter_expr("Amount>1000").unwrap(); + assert_eq!(expr.column, "Amount"); + assert_eq!(expr.op, FilterOp::Gt); + assert_eq!(expr.value, "1000"); + } + + #[test] + fn parse_lt() { + let expr = parse_filter_expr("Year<2024").unwrap(); + assert_eq!(expr.column, "Year"); + assert_eq!(expr.op, FilterOp::Lt); + assert_eq!(expr.value, "2024"); + } + + #[test] + fn parse_gte() { + let expr = parse_filter_expr("Score>=90").unwrap(); + assert_eq!(expr.column, "Score"); + assert_eq!(expr.op, FilterOp::Gte); + assert_eq!(expr.value, "90"); + } + + #[test] + fn parse_lte() { + let expr = parse_filter_expr("Price<=50.5").unwrap(); + assert_eq!(expr.column, "Price"); + assert_eq!(expr.op, FilterOp::Lte); + assert_eq!(expr.value, "50.5"); + } + + #[test] + fn parse_contains() { + let expr = parse_filter_expr("Name~john").unwrap(); + assert_eq!(expr.column, "Name"); + assert_eq!(expr.op, FilterOp::Contains); + assert_eq!(expr.value, "john"); + } + + #[test] + fn parse_not_contains() { + let expr = parse_filter_expr("Name!~draft").unwrap(); + assert_eq!(expr.column, "Name"); + assert_eq!(expr.op, FilterOp::NotContains); + assert_eq!(expr.value, "draft"); + } + + #[test] + fn parse_value_with_equals() { + let expr = parse_filter_expr("Formula=A+B=C").unwrap(); + assert_eq!(expr.column, "Formula"); + assert_eq!(expr.op, FilterOp::Eq); + assert_eq!(expr.value, "A+B=C"); + } + + #[test] + fn parse_empty_value() { + let expr = parse_filter_expr("Status=").unwrap(); + assert_eq!(expr.column, "Status"); + assert_eq!(expr.op, FilterOp::Eq); + 
assert_eq!(expr.value, ""); + } + + #[test] + fn parse_no_operator_is_err() { + assert!(parse_filter_expr("JustAWord").is_err()); + } + + #[test] + fn parse_no_column_is_err() { + assert!(parse_filter_expr("=value").is_err()); + } + + #[test] + fn parse_sort_desc() { + let spec = parse_sort_spec("Amount:desc").unwrap(); + assert_eq!(spec.column, "Amount"); + assert!(spec.descending); + } + + #[test] + fn parse_sort_asc() { + let spec = parse_sort_spec("Name:asc").unwrap(); + assert_eq!(spec.column, "Name"); + assert!(!spec.descending); + } + + #[test] + fn parse_sort_default_asc() { + let spec = parse_sort_spec("Name").unwrap(); + assert_eq!(spec.column, "Name"); + assert!(!spec.descending); + } + + #[test] + fn parse_sort_bad_dir_is_err() { + assert!(parse_sort_spec("Name:up").is_err()); + } + + #[test] + fn resolve_by_header_name() { + let cols = vec!["State".to_string(), "Amount".to_string(), "Year".to_string()]; + assert_eq!(resolve_column("Amount", &cols).unwrap(), "Amount"); + } + + #[test] + fn resolve_by_letter() { + let cols = vec!["State".to_string(), "Amount".to_string(), "Year".to_string()]; + assert_eq!(resolve_column("B", &cols).unwrap(), "Amount"); + } + + #[test] + fn resolve_by_letter_lowercase() { + let cols = vec!["State".to_string(), "Amount".to_string(), "Year".to_string()]; + assert_eq!(resolve_column("b", &cols).unwrap(), "Amount"); + } + + #[test] + fn resolve_header_takes_priority_over_letter() { + let cols = vec!["A".to_string(), "B".to_string()]; + assert_eq!(resolve_column("A", &cols).unwrap(), "A"); + } + + #[test] + fn resolve_case_insensitive_header() { + let cols = vec!["State".to_string(), "Amount".to_string()]; + assert_eq!(resolve_column("state", &cols).unwrap(), "State"); + } + + #[test] + fn resolve_unknown_column_is_err() { + let cols = vec!["State".to_string(), "Amount".to_string()]; + let err = resolve_column("Foo", &cols).unwrap_err(); + assert!(err.contains("not found"), "error was: {}", err); + } + + #[test] + fn 
resolve_letter_out_of_range_is_err() { + let cols = vec!["State".to_string()]; + let err = resolve_column("C", &cols).unwrap_err(); + assert!(err.contains("not found"), "error was: {}", err); + } + + #[test] + fn resolve_multiple_columns() { + let cols = vec!["State".to_string(), "Amount".to_string(), "Year".to_string()]; + let resolved = resolve_columns(&["A".to_string(), "Year".to_string()], &cols).unwrap(); + assert_eq!(resolved, vec!["State", "Year"]); + } + + #[test] + fn sort_ascending() { + let df = make_test_df(); + let spec = parse_sort_spec("Amount:asc").unwrap(); + let result = apply_sort(&df, &spec).unwrap(); + let col = result.column("Amount").unwrap().as_materialized_series(); + let amounts = col.i64().unwrap(); + assert_eq!(amounts.get(0), Some(500)); + assert_eq!(amounts.get(4), Some(2000)); + } + + #[test] + fn sort_descending() { + let df = make_test_df(); + let spec = parse_sort_spec("Amount:desc").unwrap(); + let result = apply_sort(&df, &spec).unwrap(); + let col = result.column("Amount").unwrap().as_materialized_series(); + let amounts = col.i64().unwrap(); + assert_eq!(amounts.get(0), Some(2000)); + assert_eq!(amounts.get(4), Some(500)); + } + + #[test] + fn pipeline_full() { + let df = make_test_df(); + let opts = FilterOptions { + filters: vec![parse_filter_expr("Amount>500").unwrap()], + cols: Some(vec!["State".to_string(), "Amount".to_string()]), + sort: Some(parse_sort_spec("Amount:desc").unwrap()), + limit: Some(2), + head: None, + tail: None, + }; + let result = filter_pipeline(df, &opts).unwrap(); + assert_eq!(result.height(), 2); + assert_eq!(result.width(), 2); + let col = result.column("Amount").unwrap().as_materialized_series(); + let amounts = col.i64().unwrap(); + assert_eq!(amounts.get(0), Some(2000)); + assert_eq!(amounts.get(1), Some(1500)); + } + + #[test] + fn pipeline_head_before_filter() { + let df = make_test_df(); // 5 rows: CA/LA, NY/NYC, CA/SF, TX/Houston, NY/Albany + let opts = FilterOptions { + filters: 
vec![parse_filter_expr("State=NY").unwrap()], + cols: None, + sort: None, + limit: None, + head: Some(3), // Take first 3 rows before filtering + tail: None, + }; + let result = filter_pipeline(df, &opts).unwrap(); + // First 3 rows: CA/LA, NY/NYC, CA/SF → only NY/NYC matches + assert_eq!(result.height(), 1); + } + + #[test] + fn pipeline_tail_before_filter() { + let df = make_test_df(); // 5 rows + let opts = FilterOptions { + filters: vec![parse_filter_expr("State=CA").unwrap()], + cols: None, + sort: None, + limit: None, + head: None, + tail: Some(3), // Last 3 rows before filtering + }; + let result = filter_pipeline(df, &opts).unwrap(); + // Last 3 rows: CA/SF, TX/Houston, NY/Albany → only CA/SF matches + assert_eq!(result.height(), 1); + } + + #[test] + fn pipeline_no_options_returns_all() { + let df = make_test_df(); + let opts = FilterOptions { + filters: vec![], + cols: None, + sort: None, + limit: None, + head: None, + tail: None, + }; + let result = filter_pipeline(df, &opts).unwrap(); + assert_eq!(result.height(), 5); + assert_eq!(result.width(), 5); + } + + #[test] + fn pipeline_cols_by_letter() { + let df = make_test_df(); + let opts = FilterOptions { + filters: vec![], + cols: Some(vec!["A".to_string(), "C".to_string()]), + sort: None, + limit: None, + head: None, + tail: None, + }; + let result = filter_pipeline(df, &opts).unwrap(); + assert_eq!(result.width(), 2); + let names: Vec<String> = result.get_column_names().iter().map(|s| s.to_string()).collect(); + assert_eq!(names, vec!["State", "Amount"]); + } + + #[test] + fn pipeline_limit_after_filter() { + let df = make_test_df(); + let opts = FilterOptions { + filters: vec![parse_filter_expr("Status=Active").unwrap()], + cols: None, + sort: None, + limit: Some(2), + head: None, + tail: None, + }; + let result = filter_pipeline(df, &opts).unwrap(); + assert_eq!(result.height(), 2); // 3 Active rows, limited to 2 + } +} diff --git a/src/formatter.rs b/src/formatter.rs @@ -237,7 +237,7 @@ fn 
df_to_strings(df: &DataFrame) -> (Vec<String>, Vec<Vec<String>>) { } /// Compute the display width for each column. -fn compute_col_widths(headers: &[String], rows: &[Vec<String>]) -> Vec<usize> { +pub fn compute_col_widths(headers: &[String], rows: &[Vec<String>]) -> Vec<usize> { let mut widths: Vec<usize> = headers.iter().map(|h| h.len().max(3)).collect(); for row in rows { for (i, cell) in row.iter().enumerate() { @@ -250,7 +250,7 @@ fn compute_col_widths(headers: &[String], rows: &[Vec<String>]) -> Vec<usize> { } /// Render a markdown table header + separator line. -fn render_table_header(headers: &[String], widths: &[usize]) -> String { +pub fn render_table_header(headers: &[String], widths: &[usize]) -> String { let mut out = String::new(); out.push('|'); for (i, h) in headers.iter().enumerate() { @@ -270,7 +270,7 @@ fn render_table_header(headers: &[String], widths: &[usize]) -> String { } /// Render markdown table data rows (no header). -fn render_table_rows(rows: &[Vec<String>], widths: &[usize]) -> String { +pub fn render_table_rows(rows: &[Vec<String>], widths: &[usize]) -> String { let mut out = String::new(); for row in rows { out.push('|'); @@ -284,7 +284,7 @@ fn render_table_rows(rows: &[Vec<String>], widths: &[usize]) -> String { } /// Render a complete aligned markdown table. -fn render_table(headers: &[String], rows: &[Vec<String>]) -> String { +pub fn render_table(headers: &[String], rows: &[Vec<String>]) -> String { let widths = compute_col_widths(headers, rows); let mut out = render_table_header(headers, &widths); out.push_str(&render_table_rows(rows, &widths)); @@ -300,7 +300,7 @@ fn format_cell(col: &Column, idx: usize) -> String { } /// Convert an AnyValue to its display string. 
-fn format_any_value(v: &AnyValue) -> String { +pub fn format_any_value(v: &AnyValue) -> String { match v { AnyValue::Null => String::new(), AnyValue::Boolean(b) => b.to_string(), diff --git a/src/lib.rs b/src/lib.rs @@ -1,4 +1,6 @@ pub mod cell; +pub mod diff; +pub mod filter; pub mod formatter; pub mod metadata; pub mod reader; diff --git a/src/reader.rs b/src/reader.rs @@ -23,14 +23,33 @@ pub fn read_sheet(path: &Path, sheet_name: &str) -> Result<DataFrame> { } pub fn range_to_dataframe(range: &calamine::Range<Data>) -> Result<DataFrame> { - let (total_rows, cols) = range.get_size(); - if total_rows == 0 || cols == 0 { + range_to_dataframe_skip(range, 0) +} + +/// Read a sheet, skipping the first `skip` rows before treating the next row as the header. +pub fn read_sheet_with_skip(path: &Path, sheet_name: &str, skip: usize) -> Result<DataFrame> { + let mut workbook = open_workbook_auto(path) + .with_context(|| format!("Cannot open workbook: {}", path.display()))?; + let range = workbook + .worksheet_range(sheet_name) + .with_context(|| format!("Cannot read sheet: {sheet_name}"))?; + range_to_dataframe_skip(&range, skip) +} + +/// Convert a calamine Range to a DataFrame, skipping `skip` rows before the header. 
+pub fn range_to_dataframe_skip(range: &calamine::Range<Data>, skip: usize) -> Result<DataFrame> { + let rows: Vec<&[Data]> = range.rows().skip(skip).collect(); + let cols = if rows.is_empty() { + 0 + } else { + rows.iter().map(|r| r.len()).max().unwrap_or(0) + }; + + if rows.is_empty() || cols == 0 { return Ok(DataFrame::default()); } - let rows: Vec<&[Data]> = range.rows().collect(); - - // First row = headers + // First row (after skip) = headers let headers: Vec<String> = rows[0] .iter() .enumerate() @@ -40,7 +59,7 @@ pub fn range_to_dataframe(range: &calamine::Range<Data>) -> Result<DataFrame> { }) .collect(); - if total_rows == 1 { + if rows.len() == 1 { // Header only, no data let series: Vec<Column> = headers .iter() @@ -374,4 +393,38 @@ mod tests { assert_eq!(df.height(), 0); assert_eq!(df.width(), 0); } + + /// Create xlsx with metadata rows above the real header + fn create_with_metadata_rows(path: &std::path::Path) { + let mut wb = Workbook::new(); + let ws = wb.add_worksheet().set_name("Data").unwrap(); + ws.write_string(0, 0, "Report Title").unwrap(); + ws.write_string(1, 0, "Generated 2024-01-01").unwrap(); + ws.write_string(2, 0, "Name").unwrap(); + ws.write_string(2, 1, "Value").unwrap(); + ws.write_string(3, 0, "Alice").unwrap(); + ws.write_number(3, 1, 100.0).unwrap(); + ws.write_string(4, 0, "Bob").unwrap(); + ws.write_number(4, 1, 200.0).unwrap(); + wb.save(path).unwrap(); + } + + #[test] + fn test_read_with_skip() { + let tmp = NamedTempFile::with_suffix(".xlsx").unwrap(); + create_with_metadata_rows(tmp.path()); + let df = read_sheet_with_skip(tmp.path(), "Data", 2).unwrap(); + let col_names: Vec<String> = df.get_column_names().iter().map(|s| s.to_string()).collect(); + assert_eq!(col_names, vec!["Name", "Value"]); + assert_eq!(df.height(), 2); + } + + #[test] + fn test_read_with_skip_zero() { + let tmp = NamedTempFile::with_suffix(".xlsx").unwrap(); + create_simple(tmp.path()); + let df = read_sheet_with_skip(tmp.path(), "Data", 0).unwrap(); 
+ assert_eq!(df.height(), 5); + assert_eq!(df.width(), 4); + } } diff --git a/tests/common/mod.rs b/tests/common/mod.rs @@ -107,3 +107,196 @@ pub fn create_empty_sheet(path: &Path) { wb.add_worksheet().set_name("Blank").unwrap(); wb.save(path).unwrap(); } + +/// Create a test file with metadata rows above the real header. +pub fn create_with_metadata(path: &Path) { + use rust_xlsxwriter::*; + let mut workbook = Workbook::new(); + let sheet = workbook.add_worksheet().set_name("Data").unwrap(); + + // Metadata rows + sheet.write_string(0, 0, "Quarterly Report").unwrap(); + sheet.write_string(1, 0, "Generated 2024-01-01").unwrap(); + + // Real header at row 2 + sheet.write_string(2, 0, "Name").unwrap(); + sheet.write_string(2, 1, "Value").unwrap(); + + // Data + sheet.write_string(3, 0, "Alice").unwrap(); + sheet.write_number(3, 1, 100.0).unwrap(); + sheet.write_string(4, 0, "Bob").unwrap(); + sheet.write_number(4, 1, 200.0).unwrap(); + + workbook.save(path.to_str().unwrap()).unwrap(); +} + +/// Create a test file with diverse data for filter testing. 
+/// Sheet "Data" with 6 rows: State, City, Amount, Year, Status +pub fn create_filterable(path: &Path) { + let mut workbook = Workbook::new(); + let sheet = workbook.add_worksheet().set_name("Data").unwrap(); + + // Headers + sheet.write_string(0, 0, "State").unwrap(); + sheet.write_string(0, 1, "City").unwrap(); + sheet.write_string(0, 2, "Amount").unwrap(); + sheet.write_string(0, 3, "Year").unwrap(); + sheet.write_string(0, 4, "Status").unwrap(); + + // Row 1: CA, Los Angeles, 1500, 2023, Active + sheet.write_string(1, 0, "CA").unwrap(); + sheet.write_string(1, 1, "Los Angeles").unwrap(); + sheet.write_number(1, 2, 1500.0).unwrap(); + sheet.write_number(1, 3, 2023.0).unwrap(); + sheet.write_string(1, 4, "Active").unwrap(); + + // Row 2: NY, New York, 2000, 2023, Active + sheet.write_string(2, 0, "NY").unwrap(); + sheet.write_string(2, 1, "New York").unwrap(); + sheet.write_number(2, 2, 2000.0).unwrap(); + sheet.write_number(2, 3, 2023.0).unwrap(); + sheet.write_string(2, 4, "Active").unwrap(); + + // Row 3: CA, San Francisco, 800, 2024, Draft + sheet.write_string(3, 0, "CA").unwrap(); + sheet.write_string(3, 1, "San Francisco").unwrap(); + sheet.write_number(3, 2, 800.0).unwrap(); + sheet.write_number(3, 3, 2024.0).unwrap(); + sheet.write_string(3, 4, "Draft").unwrap(); + + // Row 4: TX, Houston, 1200, 2024, Active + sheet.write_string(4, 0, "TX").unwrap(); + sheet.write_string(4, 1, "Houston").unwrap(); + sheet.write_number(4, 2, 1200.0).unwrap(); + sheet.write_number(4, 3, 2024.0).unwrap(); + sheet.write_string(4, 4, "Active").unwrap(); + + // Row 5: NY, Albany, 500, 2023, Draft + sheet.write_string(5, 0, "NY").unwrap(); + sheet.write_string(5, 1, "Albany").unwrap(); + sheet.write_number(5, 2, 500.0).unwrap(); + sheet.write_number(5, 3, 2023.0).unwrap(); + sheet.write_string(5, 4, "Draft").unwrap(); + + // Row 6: FL, Miami, 3000, 2024, Active + sheet.write_string(6, 0, "FL").unwrap(); + sheet.write_string(6, 1, "Miami").unwrap(); + sheet.write_number(6, 2, 
3000.0).unwrap(); + sheet.write_number(6, 3, 2024.0).unwrap(); + sheet.write_string(6, 4, "Active").unwrap(); + + workbook.save(path).unwrap(); +} + +/// Create a pair of files for positional diff testing. +/// File A: Name/Score — Alice/90, Bob/80, Charlie/70 +/// File B: Name/Score — Alice/90, Charlie/70, Dana/85 +/// Expected: Bob removed, Dana added +pub fn create_diff_pair(path_a: &Path, path_b: &Path) { + // File A + { + let mut wb = Workbook::new(); + let ws = wb.add_worksheet().set_name("Data").unwrap(); + ws.write_string(0, 0, "Name").unwrap(); + ws.write_string(0, 1, "Score").unwrap(); + ws.write_string(1, 0, "Alice").unwrap(); + ws.write_number(1, 1, 90.0).unwrap(); + ws.write_string(2, 0, "Bob").unwrap(); + ws.write_number(2, 1, 80.0).unwrap(); + ws.write_string(3, 0, "Charlie").unwrap(); + ws.write_number(3, 1, 70.0).unwrap(); + wb.save(path_a).unwrap(); + } + + // File B + { + let mut wb = Workbook::new(); + let ws = wb.add_worksheet().set_name("Data").unwrap(); + ws.write_string(0, 0, "Name").unwrap(); + ws.write_string(0, 1, "Score").unwrap(); + ws.write_string(1, 0, "Alice").unwrap(); + ws.write_number(1, 1, 90.0).unwrap(); + ws.write_string(2, 0, "Charlie").unwrap(); + ws.write_number(2, 1, 70.0).unwrap(); + ws.write_string(3, 0, "Dana").unwrap(); + ws.write_number(3, 1, 85.0).unwrap(); + wb.save(path_b).unwrap(); + } +} + +/// Create a pair of files for key-based diff testing. 
+/// File A: ID/Name/Score — "1"/Alice/90, "2"/Bob/80, "3"/Charlie/70 +/// File B: ID/Name/Score — "1"/Alice/95, "2"/Bob/80, "4"/Dana/85 +/// Expected: ID=1 modified (Score 90→95), ID=3 removed, ID=4 added +pub fn create_diff_pair_with_keys(path_a: &Path, path_b: &Path) { + // File A + { + let mut wb = Workbook::new(); + let ws = wb.add_worksheet().set_name("Data").unwrap(); + ws.write_string(0, 0, "ID").unwrap(); + ws.write_string(0, 1, "Name").unwrap(); + ws.write_string(0, 2, "Score").unwrap(); + ws.write_string(1, 0, "1").unwrap(); + ws.write_string(1, 1, "Alice").unwrap(); + ws.write_number(1, 2, 90.0).unwrap(); + ws.write_string(2, 0, "2").unwrap(); + ws.write_string(2, 1, "Bob").unwrap(); + ws.write_number(2, 2, 80.0).unwrap(); + ws.write_string(3, 0, "3").unwrap(); + ws.write_string(3, 1, "Charlie").unwrap(); + ws.write_number(3, 2, 70.0).unwrap(); + wb.save(path_a).unwrap(); + } + + // File B + { + let mut wb = Workbook::new(); + let ws = wb.add_worksheet().set_name("Data").unwrap(); + ws.write_string(0, 0, "ID").unwrap(); + ws.write_string(0, 1, "Name").unwrap(); + ws.write_string(0, 2, "Score").unwrap(); + ws.write_string(1, 0, "1").unwrap(); + ws.write_string(1, 1, "Alice").unwrap(); + ws.write_number(1, 2, 95.0).unwrap(); + ws.write_string(2, 0, "2").unwrap(); + ws.write_string(2, 1, "Bob").unwrap(); + ws.write_number(2, 2, 80.0).unwrap(); + ws.write_string(3, 0, "4").unwrap(); + ws.write_string(3, 1, "Dana").unwrap(); + ws.write_number(3, 2, 85.0).unwrap(); + wb.save(path_b).unwrap(); + } +} + +/// Create a pair of files for tolerance testing. 
+/// File A: ID/Price — "1"/100.001, "2"/200.5 +/// File B: ID/Price — "1"/100.002, "2"/200.6 +/// Expected with tolerance 0.01: only ID=2 modified (diff=0.1 > 0.01) +pub fn create_diff_pair_with_floats(path_a: &Path, path_b: &Path) { + // File A + { + let mut wb = Workbook::new(); + let ws = wb.add_worksheet().set_name("Data").unwrap(); + ws.write_string(0, 0, "ID").unwrap(); + ws.write_string(0, 1, "Price").unwrap(); + ws.write_string(1, 0, "1").unwrap(); + ws.write_number(1, 1, 100.001).unwrap(); + ws.write_string(2, 0, "2").unwrap(); + ws.write_number(2, 1, 200.5).unwrap(); + wb.save(path_a).unwrap(); + } + + // File B + { + let mut wb = Workbook::new(); + let ws = wb.add_worksheet().set_name("Data").unwrap(); + ws.write_string(0, 0, "ID").unwrap(); + ws.write_string(0, 1, "Price").unwrap(); + ws.write_string(1, 0, "1").unwrap(); + ws.write_number(1, 1, 100.002).unwrap(); + ws.write_string(2, 0, "2").unwrap(); + ws.write_number(2, 1, 200.6).unwrap(); + wb.save(path_b).unwrap(); + } +} diff --git a/tests/test_xldiff.rs b/tests/test_xldiff.rs @@ -0,0 +1,223 @@ +mod common; + +use assert_cmd::Command; +use predicates::prelude::*; +use tempfile::TempDir; + +fn xldiff() -> Command { + Command::cargo_bin("xldiff").unwrap() +} + +// --------------------------------------------------------------------------- +// test_no_diff +// --------------------------------------------------------------------------- + +#[test] +fn test_no_diff() { + let dir = TempDir::new().unwrap(); + let path_a = dir.path().join("a.xlsx"); + let path_b = dir.path().join("b.xlsx"); + common::create_simple(&path_a); + common::create_simple(&path_b); + + xldiff() + .arg(&path_a) + .arg(&path_b) + .arg("--no-color") + .assert() + .success() + .stdout(predicate::str::contains("No differences found.")); +} + +// --------------------------------------------------------------------------- +// test_positional_diff +// --------------------------------------------------------------------------- + +#[test] 
+fn test_positional_diff() { + let dir = TempDir::new().unwrap(); + let path_a = dir.path().join("a.xlsx"); + let path_b = dir.path().join("b.xlsx"); + common::create_diff_pair(&path_a, &path_b); + + xldiff() + .arg(&path_a) + .arg(&path_b) + .arg("--no-color") + .assert() + .code(1) + .stdout(predicate::str::contains("Removed: 1")) + .stdout(predicate::str::contains("Added: 1")) + .stdout(predicate::str::contains("Bob")); +} + +// --------------------------------------------------------------------------- +// test_keyed_diff +// --------------------------------------------------------------------------- + +#[test] +fn test_keyed_diff() { + let dir = TempDir::new().unwrap(); + let path_a = dir.path().join("a.xlsx"); + let path_b = dir.path().join("b.xlsx"); + common::create_diff_pair_with_keys(&path_a, &path_b); + + xldiff() + .arg(&path_a) + .arg(&path_b) + .args(["--key", "ID", "--no-color"]) + .assert() + .code(1) + .stdout(predicate::str::contains("Modified: 1")) + .stdout(predicate::str::contains("Removed: 1")) + .stdout(predicate::str::contains("Added: 1")); +} + +// --------------------------------------------------------------------------- +// test_tolerance +// --------------------------------------------------------------------------- + +#[test] +fn test_tolerance() { + let dir = TempDir::new().unwrap(); + let path_a = dir.path().join("a.xlsx"); + let path_b = dir.path().join("b.xlsx"); + common::create_diff_pair_with_floats(&path_a, &path_b); + + // With tolerance 0.01: ID=1 diff is 0.001 (within tolerance), ID=2 diff is 0.1 (exceeds) + xldiff() + .arg(&path_a) + .arg(&path_b) + .args(["--key", "ID", "--tolerance", "0.01", "--no-color"]) + .assert() + .code(1) + .stdout(predicate::str::contains("Modified: 1")); +} + +// --------------------------------------------------------------------------- +// test_json_format +// --------------------------------------------------------------------------- + +#[test] +fn test_json_format() { + let dir = 
TempDir::new().unwrap(); + let path_a = dir.path().join("a.xlsx"); + let path_b = dir.path().join("b.xlsx"); + common::create_diff_pair_with_keys(&path_a, &path_b); + + xldiff() + .arg(&path_a) + .arg(&path_b) + .args(["--key", "ID", "--format", "json"]) + .assert() + .code(1) + .stdout(predicate::str::contains("\"added\"")) + .stdout(predicate::str::contains("\"removed\"")) + .stdout(predicate::str::contains("\"modified\"")); +} + +// --------------------------------------------------------------------------- +// test_markdown_format +// --------------------------------------------------------------------------- + +#[test] +fn test_markdown_format() { + let dir = TempDir::new().unwrap(); + let path_a = dir.path().join("a.xlsx"); + let path_b = dir.path().join("b.xlsx"); + common::create_diff_pair_with_keys(&path_a, &path_b); + + xldiff() + .arg(&path_a) + .arg(&path_b) + .args(["--key", "ID", "--format", "markdown"]) + .assert() + .code(1) + .stdout(predicate::str::contains("## Added")) + .stdout(predicate::str::contains("## Removed")) + .stdout(predicate::str::contains("## Modified")); +} + +// --------------------------------------------------------------------------- +// test_csv_format +// --------------------------------------------------------------------------- + +#[test] +fn test_csv_format() { + let dir = TempDir::new().unwrap(); + let path_a = dir.path().join("a.xlsx"); + let path_b = dir.path().join("b.xlsx"); + common::create_diff_pair_with_keys(&path_a, &path_b); + + xldiff() + .arg(&path_a) + .arg(&path_b) + .args(["--key", "ID", "--format", "csv"]) + .assert() + .code(1) + .stdout(predicate::str::contains("_status")) + .stdout(predicate::str::contains("added")) + .stdout(predicate::str::contains("removed")) + .stdout(predicate::str::contains("modified")); +} + +// --------------------------------------------------------------------------- +// test_file_not_found +// --------------------------------------------------------------------------- + 
+#[test] +fn test_file_not_found() { + xldiff() + .arg("nonexistent_a.xlsx") + .arg("nonexistent_b.xlsx") + .assert() + .code(2) + .stderr(predicate::str::contains("file not found")); +} + +// --------------------------------------------------------------------------- +// test_sheet_selector +// --------------------------------------------------------------------------- + +#[test] +fn test_sheet_selector() { + let dir = TempDir::new().unwrap(); + let path = dir.path().join("multi.xlsx"); + common::create_multi_sheet(&path); + + // Revenue and Expenses sheets have different schemas and data + let arg_revenue = format!("{}:Revenue", path.display()); + let arg_expenses = format!("{}:Expenses", path.display()); + + xldiff() + .arg(&arg_revenue) + .arg(&arg_expenses) + .assert() + .code(1); +} + +// --------------------------------------------------------------------------- +// test_cols_filter +// --------------------------------------------------------------------------- + +#[test] +fn test_cols_filter() { + let dir = TempDir::new().unwrap(); + let path_a = dir.path().join("a.xlsx"); + let path_b = dir.path().join("b.xlsx"); + common::create_diff_pair_with_keys(&path_a, &path_b); + + // With --cols ID,Name: Score column is excluded from comparison. + // ID=1: Score changed 90→95, but Score is excluded, so Name is same → no modification. + // ID=3: removed, ID=4: added. + // Modified should be 0. 
+ xldiff() + .arg(&path_a) + .arg(&path_b) + .args(["--key", "ID", "--cols", "ID,Name", "--no-color"]) + .assert() + .code(1) + .stdout(predicate::str::contains("Modified: 0")) + .stdout(predicate::str::contains("Added: 1")) + .stdout(predicate::str::contains("Removed: 1")); +} diff --git a/tests/test_xlfilter.rs b/tests/test_xlfilter.rs @@ -0,0 +1,384 @@ +mod common; + +use assert_cmd::Command; +use predicates::prelude::*; +use tempfile::TempDir; + +fn xlfilter() -> Command { + Command::cargo_bin("xlfilter").unwrap() +} + +fn setup() -> (TempDir, std::path::PathBuf) { + let dir = TempDir::new().unwrap(); + let path = dir.path().join("test.xlsx"); + common::create_filterable(&path); + (dir, path) +} + +// === Basic functionality === + +#[test] +fn no_flags_shows_all_rows() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .assert() + .success() + .stdout(predicate::str::contains("CA")) + .stdout(predicate::str::contains("NY")) + .stdout(predicate::str::contains("TX")) + .stdout(predicate::str::contains("FL")) + .stderr(predicate::str::contains("6 rows")); +} + +#[test] +fn where_eq_string() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--where", "State=CA"]) + .assert() + .success() + .stdout(predicate::str::contains("Los Angeles")) + .stdout(predicate::str::contains("San Francisco")) + .stdout(predicate::str::contains("NY").not()) + .stderr(predicate::str::contains("2 rows")); +} + +#[test] +fn where_gt_numeric() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--where", "Amount>1500"]) + .assert() + .success() + .stdout(predicate::str::contains("New York")) + .stdout(predicate::str::contains("Miami")) + .stderr(predicate::str::contains("2 rows")); +} + +#[test] +fn where_multiple_and() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--where", "State=CA", "--where", "Amount>1000"]) + .assert() + .success() + .stdout(predicate::str::contains("Los Angeles")) + 
.stdout(predicate::str::contains("San Francisco").not()) + .stderr(predicate::str::contains("1 rows")); +} + +#[test] +fn where_not_eq() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--where", "Status!=Draft"]) + .assert() + .success() + .stderr(predicate::str::contains("4 rows")); +} + +#[test] +fn where_contains() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--where", "City~angel"]) + .assert() + .success() + .stdout(predicate::str::contains("Los Angeles")) + .stderr(predicate::str::contains("1 rows")); +} + +#[test] +fn where_not_contains() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--where", "Status!~raft"]) + .assert() + .success() + .stderr(predicate::str::contains("4 rows")); +} + +#[test] +fn where_no_matches() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--where", "State=ZZ"]) + .assert() + .success() + .stderr(predicate::str::contains("0 rows")); +} + +// === Column selection === + +#[test] +fn cols_by_name() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--cols", "State,Amount"]) + .assert() + .success() + .stdout(predicate::str::contains("State")) + .stdout(predicate::str::contains("Amount")) + .stdout(predicate::str::contains("City").not()) + .stdout(predicate::str::contains("Year").not()); +} + +#[test] +fn cols_by_letter() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--cols", "A,C"]) + .assert() + .success() + .stdout(predicate::str::contains("State")) + .stdout(predicate::str::contains("Amount")) + .stdout(predicate::str::contains("City").not()); +} + +#[test] +fn cols_mixed_letter_and_name() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--cols", "A,Amount"]) + .assert() + .success() + .stdout(predicate::str::contains("State")) + .stdout(predicate::str::contains("Amount")) + .stdout(predicate::str::contains("City").not()); +} + +// === Sort === + +#[test] +fn sort_desc() { + let (_dir, 
path) = setup(); + xlfilter() + .arg(&path) + .args(["--sort", "Amount:desc", "--cols", "City,Amount"]) + .assert() + .success() + .stdout(predicate::str::contains("Miami")); // 3000 = highest +} + +#[test] +fn sort_asc() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--sort", "Amount:asc", "--limit", "1", "--cols", "City,Amount"]) + .assert() + .success() + .stdout(predicate::str::contains("Albany")); // 500 = lowest +} + +#[test] +fn sort_by_column_letter() { + let (_dir, path) = setup(); + // C = Amount column + xlfilter() + .arg(&path) + .args(["--sort", "C:desc", "--limit", "1", "--cols", "City,Amount"]) + .assert() + .success() + .stdout(predicate::str::contains("Miami")); // 3000 = highest +} + +// === Limit, head, tail === + +#[test] +fn limit_caps_output() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--limit", "3"]) + .assert() + .success() + .stderr(predicate::str::contains("3 rows")); +} + +#[test] +fn head_before_filter() { + let (_dir, path) = setup(); + // First 3 rows: CA/LA, NY/NYC, CA/SF + // Filter State=NY → only NYC + xlfilter() + .arg(&path) + .args(["--head", "3", "--where", "State=NY"]) + .assert() + .success() + .stderr(predicate::str::contains("1 rows")); +} + +#[test] +fn tail_before_filter() { + let (_dir, path) = setup(); + // Last 3 rows: TX/Houston, NY/Albany, FL/Miami + // Filter State=NY → only Albany + xlfilter() + .arg(&path) + .args(["--tail", "3", "--where", "State=NY"]) + .assert() + .success() + .stderr(predicate::str::contains("1 rows")); +} + +#[test] +fn head_and_tail_mutually_exclusive() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--head", "3", "--tail", "3"]) + .assert() + .failure() + .stderr(predicate::str::contains("mutually exclusive")); +} + +// === CSV output === + +#[test] +fn csv_output() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--csv", "--where", "State=CA"]) + .assert() + .success() + 
.stdout(predicate::str::contains(",")) // CSV has commas + .stdout(predicate::str::contains("|").not()); // no markdown pipes +} + +// === Sheet selection === + +#[test] +fn sheet_by_name() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--sheet", "Data"]) + .assert() + .success(); +} + +#[test] +fn sheet_not_found() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--sheet", "NoSuchSheet"]) + .assert() + .failure() + .stderr(predicate::str::contains("not found")); +} + +// === Error cases === + +#[test] +fn file_not_found() { + xlfilter() + .arg("nonexistent.xlsx") + .assert() + .failure() + .stderr(predicate::str::contains("not found")); +} + +#[test] +fn bad_filter_expr() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--where", "NoOperator"]) + .assert() + .failure() + .stderr(predicate::str::contains("no operator found")); +} + +#[test] +fn bad_sort_dir() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--sort", "Amount:up"]) + .assert() + .failure() + .stderr(predicate::str::contains("invalid sort direction")); +} + +#[test] +fn unknown_column_in_where() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--where", "Foo=bar"]) + .assert() + .failure() + .stderr(predicate::str::contains("not found")); +} + +#[test] +fn unknown_column_in_cols() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--cols", "State,Foo"]) + .assert() + .failure() + .stderr(predicate::str::contains("not found")); +} + +#[test] +fn gt_with_non_numeric_value() { + let (_dir, path) = setup(); + xlfilter() + .arg(&path) + .args(["--where", "Amount>abc"]) + .assert() + .failure() + .stderr(predicate::str::contains("numeric value")); +} + +// === Skip rows === + +#[test] +fn skip_metadata_rows() { + let dir = TempDir::new().unwrap(); + let path = dir.path().join("meta.xlsx"); + common::create_with_metadata(&path); + + xlfilter() + .arg(&path) + .args(["--skip", "2"]) + 
.assert() + .success() + .stdout(predicate::str::contains("Name")) + .stdout(predicate::str::contains("Alice")) + .stdout(predicate::str::contains("Bob")) + .stdout(predicate::str::contains("Quarterly Report").not()) + .stderr(predicate::str::contains("2 rows")); +} + +#[test] +fn skip_with_filter() { + let dir = TempDir::new().unwrap(); + let path = dir.path().join("meta.xlsx"); + common::create_with_metadata(&path); + + xlfilter() + .arg(&path) + .args(["--skip", "2", "--where", "Name=Alice"]) + .assert() + .success() + .stdout(predicate::str::contains("Alice")) + .stdout(predicate::str::contains("Bob").not()) + .stderr(predicate::str::contains("1 rows")); +}