commit 01c053c325aa199b67f50d9dbae23a96a43e3196
parent a1ae2c1cd006db764627b32b7c247c4183273b32
Author: Erik Loualiche <[email protected]>
Date: Wed, 18 Mar 2026 09:07:52 -0400
Merge pull request #1 from LouLouLibs/feat/xldiff
feat: add xldiff — Excel sheet comparison tool
Diffstat:
19 files changed, 5771 insertions(+), 22 deletions(-)
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -34,6 +34,7 @@ jobs:
mkdir -p dist
cp target/${{ matrix.target }}/release/xlcat dist/xlcat-${{ matrix.suffix }}
cp target/${{ matrix.target }}/release/xlset dist/xlset-${{ matrix.suffix }}
+ cp target/${{ matrix.target }}/release/xlfilter dist/xlfilter-${{ matrix.suffix }}
- uses: actions/upload-artifact@v4
with:
@@ -57,17 +58,20 @@ jobs:
# ARM Mac
cp artifacts/binaries-aarch64-apple-darwin/xlcat-aarch64-apple-darwin release/
cp artifacts/binaries-aarch64-apple-darwin/xlset-aarch64-apple-darwin release/
+ cp artifacts/binaries-aarch64-apple-darwin/xlfilter-aarch64-apple-darwin release/
# x86 Mac
cp artifacts/binaries-x86_64-apple-darwin/xlcat-x86_64-apple-darwin release/
cp artifacts/binaries-x86_64-apple-darwin/xlset-x86_64-apple-darwin release/
+ cp artifacts/binaries-x86_64-apple-darwin/xlfilter-x86_64-apple-darwin release/
# Demo GIFs
cp demo/xlcat.gif release/
cp demo/xlset.gif release/
+ cp demo/xlfilter.gif release/
# Make binaries executable
- chmod +x release/xlcat-* release/xlset-*
+ chmod +x release/xlcat-* release/xlset-* release/xlfilter-*
- name: Create GitHub Release
env:
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1,20 @@
+# xl-cli-tool
+
+Rust CLI tools for working with Excel (.xlsx/.xls) files, designed for LLM-assisted analysis (Claude Code).
+
+## Current tools
+- **xlcat** — view/inspect spreadsheets (metadata, schema, stats, data as markdown tables or CSV)
+- **xlset** — write cells preserving formatting/formulas/structure
+- **xlfilter** — query/filter rows and columns from spreadsheets
+- **xldiff** — compare two sheets, report added/removed/modified rows
+
+## Architecture
+- Shared library in `src/lib.rs` (reader, formatter, metadata, cell, writer, filter, diff modules)
+- Binary entry points in `src/bin/`
+- Dependencies: calamine (read), umya-spreadsheet (write), polars (data), clap (CLI), serde_json (JSON output)
+
+## Conventions
+- Exit codes: 0 = success, 1 = runtime error, 2 = invalid arguments
+ - Exception: xldiff uses diff(1) convention: 0 = no differences, 1 = differences found, 2 = error
+- Output: markdown tables by default, CSV with `--csv`
+- Rust edition 2024
diff --git a/Cargo.lock b/Cargo.lock
@@ -2160,6 +2160,7 @@ version = "1.0.149"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
dependencies = [
+ "indexmap",
"itoa",
"memchr",
"serde",
@@ -2977,6 +2978,7 @@ dependencies = [
"polars",
"predicates",
"rust_xlsxwriter",
+ "serde_json",
"tempfile",
"umya-spreadsheet",
]
diff --git a/Cargo.toml b/Cargo.toml
@@ -21,12 +21,21 @@ path = "src/bin/xlcat.rs"
name = "xlset"
path = "src/bin/xlset.rs"
+[[bin]]
+name = "xlfilter"
+path = "src/bin/xlfilter.rs"
+
+[[bin]]
+name = "xldiff"
+path = "src/bin/xldiff.rs"
+
[dependencies]
calamine = "0.26"
polars = { version = "0.46", features = ["dtype-date", "dtype-datetime", "dtype-duration", "csv"] }
clap = { version = "4", features = ["derive"] }
anyhow = "1"
umya-spreadsheet = "2"
+serde_json = { version = "1", features = ["preserve_order"] }
[profile.release]
strip = true
diff --git a/README.md b/README.md
@@ -8,12 +8,14 @@
<table>
<tr>
-<td align="center" width="50%"><strong>xlcat</strong> — view</td>
-<td align="center" width="50%"><strong>xlset</strong> — edit</td>
+<td align="center" width="33%"><strong>xlcat</strong> — view</td>
+<td align="center" width="33%"><strong>xlset</strong> — edit</td>
+<td align="center" width="33%"><strong>xlfilter</strong> — query</td>
</tr>
<tr>
<td><img src="demo/xlcat.gif" alt="xlcat demo" /></td>
<td><img src="demo/xlset.gif" alt="xlset demo" /></td>
+<td><img src="demo/xlfilter.gif" alt="xlfilter demo" /></td>
</tr>
</table>
@@ -21,10 +23,11 @@
***
-Two binaries, no runtime dependencies:
+Three binaries, no runtime dependencies:
- **`xlcat`** — view xlsx/xls files as markdown tables or CSV
- **`xlset`** — modify cells in existing xlsx files, preserving formatting
+- **`xlfilter`** — filter, sort, and query rows from spreadsheets
## Installation
@@ -36,12 +39,14 @@ Download from [Releases](https://github.com/LouLouLibs/xl-cli-tools/releases):
# Apple Silicon
curl -L https://github.com/LouLouLibs/xl-cli-tools/releases/latest/download/xlcat-aarch64-apple-darwin -o ~/.local/bin/xlcat
curl -L https://github.com/LouLouLibs/xl-cli-tools/releases/latest/download/xlset-aarch64-apple-darwin -o ~/.local/bin/xlset
-chmod +x ~/.local/bin/xlcat ~/.local/bin/xlset
+curl -L https://github.com/LouLouLibs/xl-cli-tools/releases/latest/download/xlfilter-aarch64-apple-darwin -o ~/.local/bin/xlfilter
+chmod +x ~/.local/bin/xlcat ~/.local/bin/xlset ~/.local/bin/xlfilter
# Intel Mac
curl -L https://github.com/LouLouLibs/xl-cli-tools/releases/latest/download/xlcat-x86_64-apple-darwin -o ~/.local/bin/xlcat
curl -L https://github.com/LouLouLibs/xl-cli-tools/releases/latest/download/xlset-x86_64-apple-darwin -o ~/.local/bin/xlset
-chmod +x ~/.local/bin/xlcat ~/.local/bin/xlset
+curl -L https://github.com/LouLouLibs/xl-cli-tools/releases/latest/download/xlfilter-x86_64-apple-darwin -o ~/.local/bin/xlfilter
+chmod +x ~/.local/bin/xlcat ~/.local/bin/xlset ~/.local/bin/xlfilter
```
### From source
@@ -115,6 +120,59 @@ xlcat report.xlsx --csv --head 100 > subset.csv
- **Multiple sheets:** lists schemas, pick one with `--sheet`
- **Large file (>1MB):** schema + first 25 rows (override with `--max-size 5M`)
+## xlfilter — Query and Filter
+
+```bash
+# Filter rows by value
+xlfilter data.xlsx --where "State=CA"
+
+# Numeric comparisons
+xlfilter data.xlsx --where "Amount>1000"
+
+# Multiple filters (AND)
+xlfilter data.xlsx --where "State=CA" --where "Amount>1000"
+
+# Select columns (by name or letter)
+xlfilter data.xlsx --cols State,Amount,Year
+xlfilter data.xlsx --cols A,C,D
+
+# Sort results
+xlfilter data.xlsx --sort "Amount:desc"
+
+# Limit output
+xlfilter data.xlsx --sort "Amount:desc" --limit 10
+
+# Contains filter (case-insensitive)
+xlfilter data.xlsx --where "Name~john"
+
+# Head/tail (applied before filtering)
+xlfilter data.xlsx --head 100 --where "Status=Active"
+
+# Skip metadata rows above the real header
+xlfilter data.xlsx --skip 2
+
+# CSV output for piping
+xlfilter data.xlsx --where "Status!=Draft" --csv | other-tool
+
+# Target a specific sheet
+xlfilter data.xlsx --sheet Revenue --where "Amount>5000"
+```
+
+### Filter operators
+
+| Operator | Meaning | Example |
+|----------|---------|---------|
+| `=` | Equals | `State=CA` |
+| `!=` | Not equals | `Status!=Draft` |
+| `>` | Greater than | `Amount>1000` |
+| `<` | Less than | `Year<2024` |
+| `>=` | Greater or equal | `Score>=90` |
+| `<=` | Less or equal | `Price<=50` |
+| `~` | Contains (case-insensitive) | `Name~john` |
+| `!~` | Not contains | `Name!~test` |
+
+Numeric columns compare numerically; string columns compare lexicographically. Row count is printed to stderr.
+
## xlset — Edit Excel Cells
```bash
@@ -171,11 +229,11 @@ Claude Code skills (`/xlcat` and `/xlset`) are available in [claude-skills](http
## Exit codes
-| Code | xlcat | xlset |
-|------|-------|-------|
-| 0 | Success | Success |
-| 1 | Runtime error | Runtime error |
-| 2 | Invalid arguments | Invalid arguments |
+| Code | Meaning |
+|------|---------|
+| 0 | Success (xldiff: no differences) |
+| 1 | Runtime error (xldiff: differences found) |
+| 2 | Invalid arguments (xldiff: error) |
## License
diff --git a/demo/xlfilter.gif b/demo/xlfilter.gif
Binary files differ.
diff --git a/demo/xlfilter.tape b/demo/xlfilter.tape
@@ -0,0 +1,60 @@
+# VHS tape for recording xlfilter demo GIF.
+# Usage: vhs demo/xlfilter.tape
+#
+# Prerequisites:
+# - xlfilter binary built and in PATH
+# - Demo xlsx files created: cargo run --example create_demo
+
+Output demo/xlfilter.gif
+
+Set FontSize 14
+Set Width 1100
+Set Height 550
+Set Padding 15
+Set Theme "GruvboxDarkHard"
+
+Set TypingSpeed 80ms
+
+Set Shell "bash"
+
+Hide
+ Type 'export PS1="> "'
+ Enter
+ Type "clear"
+ Enter
+Show
+
+# --- 1. FILTER ROWS ---
+Type "xlfilter demo/sales.xlsx --where Region=East"
+Enter
+Sleep 3s
+
+# --- 2. NUMERIC COMPARISON ---
+Type "xlfilter demo/sales.xlsx --where 'Revenue>12000'"
+Enter
+Sleep 3s
+
+# --- 3. MULTIPLE FILTERS (AND) ---
+Type "xlfilter demo/sales.xlsx --where Region=East --where 'Revenue>10000'"
+Enter
+Sleep 3s
+
+# --- 4. SELECT COLUMNS + SORT ---
+Type "xlfilter demo/sales.xlsx --cols Region,Product,Revenue --sort Revenue:desc"
+Enter
+Sleep 3s
+
+# --- 5. LIMIT OUTPUT ---
+Type "xlfilter demo/sales.xlsx --sort Revenue:desc --limit 3 --cols Product,Revenue"
+Enter
+Sleep 2s
+
+# --- 6. CONTAINS FILTER ---
+Type "xlfilter demo/sales.xlsx --where 'Product~Widget A' --cols Date,Region,Revenue"
+Enter
+Sleep 3s
+
+# --- 7. CSV OUTPUT FOR PIPING ---
+Type "xlfilter demo/sales.xlsx --where Region=West --csv"
+Enter
+Sleep 3s
diff --git a/docs/superpowers/plans/2026-03-17-xldiff.md b/docs/superpowers/plans/2026-03-17-xldiff.md
@@ -0,0 +1,1894 @@
+# xldiff Implementation Plan
+
+> **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Port go-xldiff to Rust as the `xldiff` binary in this repo, with type-aware comparison and tolerance support.
+
+**Architecture:** New `src/diff.rs` module for core diff logic (data structures, positional/key-based algorithms, tolerance). New `src/bin/xldiff.rs` binary for CLI parsing, file:sheet argument handling, and four output formatters (text, markdown, json, csv). Reuses existing `reader`, `metadata`, `filter` (for `resolve_column`), and `formatter` modules.
+
+**Tech Stack:** Rust, polars (DataFrames), calamine (reading), clap (CLI), serde_json (JSON output), std::io::IsTerminal (TTY detection)
+
+**Spec:** `docs/superpowers/specs/2026-03-17-xldiff-design.md`
+
+---
+
+### Task 1: Cargo.toml & lib.rs setup
+
+**Files:**
+- Modify: `Cargo.toml`
+- Modify: `src/lib.rs`
+
+- [ ] **Step 1: Add xldiff binary entry and dependencies to Cargo.toml**
+
+Add after the xlfilter `[[bin]]` block:
+
+```toml
+[[bin]]
+name = "xldiff"
+path = "src/bin/xldiff.rs"
+```
+
+Add to `[dependencies]`:
+
+```toml
+serde_json = { version = "1", features = ["preserve_order"] }
+```
+
+- [ ] **Step 2: Add diff module to lib.rs**
+
+```rust
+pub mod cell;
+pub mod diff;
+pub mod filter;
+pub mod formatter;
+pub mod metadata;
+pub mod reader;
+pub mod writer;
+```
+
+- [ ] **Step 3: Create empty diff.rs so the project compiles**
+
+Create `src/diff.rs`:
+
+```rust
+// Diff engine for comparing two Excel sheets.
+```
+
+- [ ] **Step 4: Verify project compiles**
+
+Run: `cargo check`
+Expected: success (no errors)
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add Cargo.toml src/lib.rs src/diff.rs
+git commit -m "chore: add xldiff binary entry, serde deps, empty diff module"
+```
+
+---
+
+### Task 2: Make formatter helpers public
+
+**Files:**
+- Modify: `src/formatter.rs`
+
+The markdown output formatter in xldiff needs `render_table`, `compute_col_widths`, `render_table_header`, `render_table_rows`, and `format_any_value` from `formatter.rs`. These are currently private.
+
+- [ ] **Step 1: Make the five helper functions public**
+
+In `src/formatter.rs`, change:
+
+- `fn compute_col_widths(` → `pub fn compute_col_widths(`
+- `fn render_table_header(` → `pub fn render_table_header(`
+- `fn render_table_rows(` → `pub fn render_table_rows(`
+- `fn render_table(` → `pub fn render_table(`
+- `fn format_any_value(` → `pub fn format_any_value(`
+
+- [ ] **Step 2: Verify project compiles**
+
+Run: `cargo check`
+Expected: success
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add src/formatter.rs
+git commit -m "refactor: make formatter table helpers public for xldiff reuse"
+```
+
+---
+
+### Task 3: Make `col_letter_to_index` and `resolve_column` available from filter module
+
+**Files:**
+- Modify: `src/filter.rs`
+
+`col_letter_to_index` is currently private in `filter.rs`. The xldiff binary needs `resolve_column` (already public) and `resolve_columns` (already public), which internally use `col_letter_to_index`. Since `resolve_column` is already public, no changes are needed here — xldiff will call `xlcat::filter::resolve_column()` directly.
+
+**This task is a no-op** — confirming that the existing public API is sufficient. Skip to Task 4.
+
+---
+
+### Task 4: diff.rs — data structures
+
+**Files:**
+- Modify: `src/diff.rs`
+
+- [ ] **Step 1: Write the data structures**
+
+```rust
+use polars::prelude::*;
+
+/// Source file and sheet metadata for display.
+#[derive(Debug, Clone)]
+pub struct SheetSource {
+ pub file_name: String,
+ pub sheet_name: String,
+}
+
+/// A single row from an added or removed set.
+#[derive(Debug, Clone)]
+pub struct DiffRow {
+ pub values: Vec<String>,
+}
+
+/// A change in a single cell.
+#[derive(Debug, Clone)]
+pub struct CellChange {
+ pub column: String,
+ pub old_value: String,
+ pub new_value: String,
+}
+
+/// A row present in both files with cell-level differences.
+#[derive(Debug, Clone)]
+pub struct ModifiedRow {
+ pub key: Vec<String>,
+ pub changes: Vec<CellChange>,
+}
+
+/// Result of comparing two sheets.
+#[derive(Debug, Clone)]
+pub struct DiffResult {
+ pub headers: Vec<String>,
+ pub key_columns: Vec<String>,
+ pub added: Vec<DiffRow>,
+ pub removed: Vec<DiffRow>,
+ pub modified: Vec<ModifiedRow>,
+ pub source_a: SheetSource,
+ pub source_b: SheetSource,
+}
+
+impl DiffResult {
+ pub fn has_differences(&self) -> bool {
+ !self.added.is_empty() || !self.removed.is_empty() || !self.modified.is_empty()
+ }
+}
+
+/// Options controlling how the diff is performed.
+#[derive(Debug, Clone, Default)]
+pub struct DiffOptions {
+ pub key_columns: Vec<String>,
+ pub tolerance: Option<f64>,
+}
+```
+
+- [ ] **Step 2: Verify project compiles**
+
+Run: `cargo check`
+Expected: success
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add src/diff.rs
+git commit -m "feat(xldiff): add diff data structures — DiffResult, DiffRow, ModifiedRow, CellChange"
+```
+
+---
+
+### Task 5: diff.rs — positional mode
+
+**Files:**
+- Modify: `src/diff.rs`
+- Create: `tests/test_xldiff.rs` (unit tests only at first)
+
+- [ ] **Step 1: Write the failing test for positional diff**
+
+Add at the bottom of `src/diff.rs`:
+
+```rust
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ fn make_source(name: &str) -> SheetSource {
+ SheetSource {
+ file_name: format!("{name}.xlsx"),
+ sheet_name: "Sheet1".to_string(),
+ }
+ }
+
+ #[test]
+ fn test_positional_no_diff() {
+ let s1 = Series::new("name".into(), &["Alice", "Bob"]);
+ let s2 = Series::new("score".into(), &[90i64, 80]);
+ let df_a = DataFrame::new(vec![s1.clone().into_column(), s2.clone().into_column()]).unwrap();
+ let df_b = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap();
+
+ let opts = DiffOptions::default();
+ let result = diff_positional(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap();
+ assert!(!result.has_differences());
+ assert!(result.added.is_empty());
+ assert!(result.removed.is_empty());
+ }
+
+ #[test]
+ fn test_positional_added_removed() {
+ let df_a = DataFrame::new(vec![
+ Series::new("name".into(), &["Alice", "Bob"]).into_column(),
+ ]).unwrap();
+ let df_b = DataFrame::new(vec![
+ Series::new("name".into(), &["Alice", "Charlie"]).into_column(),
+ ]).unwrap();
+
+ let opts = DiffOptions::default();
+ let result = diff_positional(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap();
+ assert_eq!(result.removed.len(), 1);
+ assert_eq!(result.removed[0].values, vec!["Bob"]);
+ assert_eq!(result.added.len(), 1);
+ assert_eq!(result.added[0].values, vec!["Charlie"]);
+ assert!(result.modified.is_empty());
+ }
+
+ #[test]
+ fn test_positional_duplicate_rows() {
+ let df_a = DataFrame::new(vec![
+ Series::new("x".into(), &["A", "A", "A"]).into_column(),
+ ]).unwrap();
+ let df_b = DataFrame::new(vec![
+ Series::new("x".into(), &["A", "A"]).into_column(),
+ ]).unwrap();
+
+ let opts = DiffOptions::default();
+ let result = diff_positional(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap();
+ assert_eq!(result.removed.len(), 1);
+ assert!(result.added.is_empty());
+ }
+}
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run: `cargo test --lib diff::tests`
+Expected: FAIL — `diff_positional` not found
+
+- [ ] **Step 3: Implement positional diff**
+
+Add to `src/diff.rs` before the tests module:
+
+```rust
+use crate::formatter;
+use std::collections::HashMap;
+
+/// Format a cell value for display (null/error → empty string); `cell_to_key_part` below uses a null sentinel to distinguish null from empty.
+fn cell_to_string(col: &Column, idx: usize) -> String {
+ match col.get(idx) {
+ Ok(AnyValue::Null) | Err(_) => String::new(),
+ Ok(v) => formatter::format_any_value(&v),
+ }
+}
+
+fn cell_to_key_part(col: &Column, idx: usize) -> String {
+ match col.get(idx) {
+ Ok(AnyValue::Null) | Err(_) => "\x01NULL\x01".to_string(),
+ Ok(v) => formatter::format_any_value(&v),
+ }
+}
+
+/// Convert a DataFrame row to a string key by joining cell values with null bytes.
+fn row_to_key(df: &DataFrame, row_idx: usize) -> String {
+ let cols = df.get_columns();
+ let parts: Vec<String> = cols
+ .iter()
+ .map(|c| cell_to_key_part(c, row_idx))
+ .collect();
+ parts.join("\0")
+}
+
+/// Convert a DataFrame row to a Vec<String> of display values.
+fn row_to_strings(df: &DataFrame, row_idx: usize) -> Vec<String> {
+ df.get_columns()
+ .iter()
+ .map(|c| cell_to_string(c, row_idx))
+ .collect()
+}
+
+/// Positional diff: treats entire row as identity using multiset comparison.
+pub fn diff_positional(
+ df_a: &DataFrame,
+ df_b: &DataFrame,
+ _opts: &DiffOptions,
+ source_a: SheetSource,
+ source_b: SheetSource,
+) -> anyhow::Result<DiffResult> {
+ let headers_a: Vec<String> = df_a.get_column_names().iter().map(|s| s.to_string()).collect();
+ let headers_b: Vec<String> = df_b.get_column_names().iter().map(|s| s.to_string()).collect();
+
+ // Warn about mismatched column counts in positional mode
+ if headers_a.len() != headers_b.len() {
+ eprintln!(
+ "warning: column count differs ({} vs {}), padding shorter side with empty values",
+ headers_a.len(), headers_b.len()
+ );
+ }
+
+ // Use headers from the file with more columns
+ let headers = if headers_b.len() > headers_a.len() { headers_b } else { headers_a };
+
+ // Build frequency maps
+ let mut counts_a: HashMap<String, Vec<usize>> = HashMap::new();
+ for i in 0..df_a.height() {
+ counts_a.entry(row_to_key(df_a, i)).or_default().push(i);
+ }
+ let mut counts_b: HashMap<String, Vec<usize>> = HashMap::new();
+ for i in 0..df_b.height() {
+ counts_b.entry(row_to_key(df_b, i)).or_default().push(i);
+ }
+
+ let mut removed = Vec::new();
+ let mut consumed_b: HashMap<String, usize> = HashMap::new();
+
+ // Walk A: try to match with B
+ for i in 0..df_a.height() {
+ let key = row_to_key(df_a, i);
+ let b_count = counts_b.get(&key).map(|v| v.len()).unwrap_or(0);
+ let already_consumed = consumed_b.get(&key).copied().unwrap_or(0);
+ if already_consumed < b_count {
+ *consumed_b.entry(key).or_insert(0) += 1;
+ } else {
+ let mut vals = row_to_strings(df_a, i);
+ vals.resize(headers.len(), String::new()); // pad if needed
+ removed.push(DiffRow { values: vals });
+ }
+ }
+
+ // Walk B: anything not consumed is added
+ let mut consumed_a: HashMap<String, usize> = HashMap::new();
+ let mut added = Vec::new();
+ for i in 0..df_b.height() {
+ let key = row_to_key(df_b, i);
+ let a_count = counts_a.get(&key).map(|v| v.len()).unwrap_or(0);
+ let already_consumed = consumed_a.get(&key).copied().unwrap_or(0);
+ if already_consumed < a_count {
+ *consumed_a.entry(key).or_insert(0) += 1;
+ } else {
+ let mut vals = row_to_strings(df_b, i);
+ vals.resize(headers.len(), String::new()); // pad if needed
+ added.push(DiffRow { values: vals });
+ }
+ }
+
+ Ok(DiffResult {
+ headers,
+ key_columns: vec![],
+ added,
+ removed,
+ modified: vec![],
+ source_a,
+ source_b,
+ })
+}
+```
+
+- [ ] **Step 4: Run tests to verify they pass**
+
+Run: `cargo test --lib diff::tests`
+Expected: all 3 tests PASS
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/diff.rs
+git commit -m "feat(xldiff): implement positional diff with multiset comparison"
+```
+
+---
+
+### Task 6: diff.rs — key-based mode
+
+**Files:**
+- Modify: `src/diff.rs`
+
+- [ ] **Step 1: Write failing tests for key-based diff**
+
+Add to the `tests` module in `src/diff.rs`:
+
+```rust
+ #[test]
+ fn test_keyed_no_diff() {
+ let df_a = DataFrame::new(vec![
+ Series::new("id".into(), &["1", "2"]).into_column(),
+ Series::new("val".into(), &[10i64, 20]).into_column(),
+ ]).unwrap();
+ let df_b = df_a.clone();
+
+ let opts = DiffOptions { key_columns: vec!["id".to_string()], ..Default::default() };
+ let result = diff_keyed(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap();
+ assert!(!result.has_differences());
+ }
+
+ #[test]
+ fn test_keyed_added_removed() {
+ let df_a = DataFrame::new(vec![
+ Series::new("id".into(), &["1", "2"]).into_column(),
+ Series::new("val".into(), &[10i64, 20]).into_column(),
+ ]).unwrap();
+ let df_b = DataFrame::new(vec![
+ Series::new("id".into(), &["2", "3"]).into_column(),
+ Series::new("val".into(), &[20i64, 30]).into_column(),
+ ]).unwrap();
+
+ let opts = DiffOptions { key_columns: vec!["id".to_string()], ..Default::default() };
+ let result = diff_keyed(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap();
+ assert_eq!(result.removed.len(), 1);
+ assert_eq!(result.added.len(), 1);
+ assert_eq!(result.removed[0].values, vec!["1", "10"]);
+ assert_eq!(result.added[0].values, vec!["3", "30"]);
+ }
+
+ #[test]
+ fn test_keyed_modified() {
+ let df_a = DataFrame::new(vec![
+ Series::new("id".into(), &["1", "2"]).into_column(),
+ Series::new("score".into(), &[90i64, 80]).into_column(),
+ ]).unwrap();
+ let df_b = DataFrame::new(vec![
+ Series::new("id".into(), &["1", "2"]).into_column(),
+ Series::new("score".into(), &[95i64, 80]).into_column(),
+ ]).unwrap();
+
+ let opts = DiffOptions { key_columns: vec!["id".to_string()], ..Default::default() };
+ let result = diff_keyed(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap();
+ assert!(result.added.is_empty());
+ assert!(result.removed.is_empty());
+ assert_eq!(result.modified.len(), 1);
+ assert_eq!(result.modified[0].key, vec!["1"]);
+ assert_eq!(result.modified[0].changes.len(), 1);
+ assert_eq!(result.modified[0].changes[0].column, "score");
+ assert_eq!(result.modified[0].changes[0].old_value, "90");
+ assert_eq!(result.modified[0].changes[0].new_value, "95");
+ }
+
+ #[test]
+ fn test_keyed_composite_key() {
+ let df_a = DataFrame::new(vec![
+ Series::new("date".into(), &["2024-01", "2024-01"]).into_column(),
+ Series::new("ticker".into(), &["AAPL", "GOOG"]).into_column(),
+ Series::new("price".into(), &[150i64, 100]).into_column(),
+ ]).unwrap();
+ let df_b = DataFrame::new(vec![
+ Series::new("date".into(), &["2024-01", "2024-01"]).into_column(),
+ Series::new("ticker".into(), &["AAPL", "GOOG"]).into_column(),
+ Series::new("price".into(), &[155i64, 100]).into_column(),
+ ]).unwrap();
+
+ let opts = DiffOptions {
+ key_columns: vec!["date".to_string(), "ticker".to_string()],
+ ..Default::default()
+ };
+ let result = diff_keyed(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap();
+ assert_eq!(result.modified.len(), 1);
+ assert_eq!(result.modified[0].key, vec!["2024-01", "AAPL"]);
+ }
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run: `cargo test --lib diff::tests`
+Expected: FAIL — `diff_keyed` not found
+
+- [ ] **Step 3: Implement key-based diff**
+
+Add to `src/diff.rs`:
+
+```rust
+/// Row data stored during key-based comparison.
+struct KeyedRow {
+ values: Vec<String>,
+ key_values: Vec<String>,
+}
+
+/// Key-based diff: match rows by key columns, compare remaining cells.
+pub fn diff_keyed(
+ df_a: &DataFrame,
+ df_b: &DataFrame,
+ opts: &DiffOptions,
+ source_a: SheetSource,
+ source_b: SheetSource,
+) -> anyhow::Result<DiffResult> {
+ let headers_a: Vec<String> = df_a.get_column_names().iter().map(|s| s.to_string()).collect();
+ let headers_b: Vec<String> = df_b.get_column_names().iter().map(|s| s.to_string()).collect();
+
+ // Resolve key column indices for both frames
+ let key_indices_a: Vec<usize> = opts.key_columns.iter()
+ .map(|k| headers_a.iter().position(|h| h == k)
+ .ok_or_else(|| anyhow::anyhow!("key column '{}' not found in first file", k)))
+ .collect::<anyhow::Result<Vec<_>>>()?;
+
+ let key_indices_b: Vec<usize> = opts.key_columns.iter()
+ .map(|k| headers_b.iter().position(|h| h == k)
+ .ok_or_else(|| anyhow::anyhow!("key column '{}' not found in second file", k)))
+ .collect::<anyhow::Result<Vec<_>>>()?;
+
+ // Find common non-key columns for comparison
+ let non_key_a: Vec<String> = headers_a.iter()
+ .filter(|h| !opts.key_columns.contains(h))
+ .cloned().collect();
+ let non_key_b: Vec<String> = headers_b.iter()
+ .filter(|h| !opts.key_columns.contains(h))
+ .cloned().collect();
+ let common_non_key: Vec<String> = non_key_a.iter()
+ .filter(|h| non_key_b.contains(h))
+ .cloned().collect();
+
+ // Warn about columns only in one file
+ for col in &non_key_a {
+ if !non_key_b.contains(col) {
+ eprintln!("warning: column '{}' only in first file, skipping", col);
+ }
+ }
+ for col in &non_key_b {
+ if !non_key_a.contains(col) {
+ eprintln!("warning: column '{}' only in second file, skipping", col);
+ }
+ }
+
+    // Headers = key columns, then all of A's non-key columns in A's order, then columns only in B
+ let mut headers = opts.key_columns.clone();
+ headers.extend(non_key_a.iter().chain(
+ non_key_b.iter().filter(|h| !non_key_a.contains(h))
+ ).cloned());
+
+ // Build row maps
+ let map_a = build_key_map(df_a, &key_indices_a);
+ let map_b = build_key_map(df_b, &key_indices_b);
+
+ // Detect duplicate keys
+ check_duplicate_keys(df_a, &key_indices_a, &source_a);
+ check_duplicate_keys(df_b, &key_indices_b, &source_b);
+
+ let mut added = Vec::new();
+ let mut removed = Vec::new();
+ let mut modified = Vec::new();
+
+ // Removed: keys in A but not B
+ for (key_str, row_a) in &map_a {
+ if !map_b.contains_key(key_str) {
+ removed.push(DiffRow { values: row_a.values.clone() });
+ }
+ }
+
+ // Added: keys in B but not A; Modified: keys in both
+ for (key_str, row_b) in &map_b {
+ match map_a.get(key_str) {
+ None => {
+ added.push(DiffRow { values: row_b.values.clone() });
+ }
+ Some(row_a) => {
+ let changes = compare_rows(
+ df_a, df_b,
+ &headers_a, &headers_b,
+ row_a, row_b,
+ &common_non_key,
+ opts,
+ );
+ if !changes.is_empty() {
+ modified.push(ModifiedRow {
+ key: row_a.key_values.clone(),
+ changes,
+ });
+ }
+ }
+ }
+ }
+
+ Ok(DiffResult {
+ headers,
+ key_columns: opts.key_columns.clone(),
+ added,
+ removed,
+ modified,
+ source_a,
+ source_b,
+ })
+}
+
+fn build_key_map(df: &DataFrame, key_indices: &[usize]) -> HashMap<String, KeyedRow> {
+ let cols = df.get_columns();
+ let mut map = HashMap::new();
+ for i in 0..df.height() {
+ let key_values: Vec<String> = key_indices.iter()
+ .map(|&idx| formatter::format_any_value(&cols[idx].get(i).unwrap_or(AnyValue::Null)))
+ .collect();
+ let key_str = key_values.join("\0");
+ let values: Vec<String> = cols.iter()
+ .map(|c| formatter::format_any_value(&c.get(i).unwrap_or(AnyValue::Null)))
+ .collect();
+ map.insert(key_str, KeyedRow { values, key_values });
+ }
+ map
+}
+
+fn check_duplicate_keys(df: &DataFrame, key_indices: &[usize], source: &SheetSource) {
+ let cols = df.get_columns();
+ let mut seen: HashMap<String, usize> = HashMap::new();
+ for i in 0..df.height() {
+ let key: Vec<String> = key_indices.iter()
+ .map(|&idx| formatter::format_any_value(&cols[idx].get(i).unwrap_or(AnyValue::Null)))
+ .collect();
+ let key_str = key.join(", ");
+ let count = seen.entry(key_str.clone()).or_insert(0);
+ *count += 1;
+ if *count == 2 {
+ eprintln!("warning: duplicate key \"{}\" in {}", key_str, source.file_name);
+ }
+ }
+}
+
+/// Compare non-key columns of two matched rows.
+fn compare_rows(
+ df_a: &DataFrame,
+ df_b: &DataFrame,
+ headers_a: &[String],
+ headers_b: &[String],
+ row_a: &KeyedRow,
+ row_b: &KeyedRow,
+ common_columns: &[String],
+ opts: &DiffOptions,
+) -> Vec<CellChange> {
+ let mut changes = Vec::new();
+ for col_name in common_columns {
+ let idx_a = headers_a.iter().position(|h| h == col_name).unwrap();
+ let idx_b = headers_b.iter().position(|h| h == col_name).unwrap();
+ let val_a = &row_a.values[idx_a];
+ let val_b = &row_b.values[idx_b];
+
+ let is_equal = if let Some(tol) = opts.tolerance {
+ values_equal_with_tolerance(val_a, val_b, tol, df_a, df_b, col_name)
+ } else {
+ val_a == val_b
+ };
+
+ if !is_equal {
+ changes.push(CellChange {
+ column: col_name.clone(),
+ old_value: val_a.clone(),
+ new_value: val_b.clone(),
+ });
+ }
+ }
+ changes
+}
+
+fn is_float_dtype(dt: &DataType) -> bool {
+ matches!(dt, DataType::Float32 | DataType::Float64)
+}
+
+fn is_int_dtype(dt: &DataType) -> bool {
+ matches!(dt, DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64
+ | DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64)
+}
+
+/// Check if two values are equal, applying tolerance for float columns.
+/// Tolerance applies to Float64 columns and Int64-vs-Float64 cross-type,
+/// but NOT to pure integer columns (spec requirement).
+fn values_equal_with_tolerance(
+ val_a: &str,
+ val_b: &str,
+ tolerance: f64,
+ df_a: &DataFrame,
+ df_b: &DataFrame,
+ col_name: &str,
+) -> bool {
+ if let (Ok(a), Ok(b)) = (val_a.parse::<f64>(), val_b.parse::<f64>()) {
+ // Handle NaN
+ if a.is_nan() && b.is_nan() {
+ return true;
+ }
+ if a.is_nan() || b.is_nan() {
+ return false;
+ }
+ let type_a = df_a.column(col_name).ok().map(|c| c.dtype().clone());
+ let type_b = df_b.column(col_name).ok().map(|c| c.dtype().clone());
+
+ let a_is_float = type_a.as_ref().map_or(false, is_float_dtype);
+ let b_is_float = type_b.as_ref().map_or(false, is_float_dtype);
+ let a_is_int = type_a.as_ref().map_or(false, is_int_dtype);
+ let b_is_int = type_b.as_ref().map_or(false, is_int_dtype);
+
+ // Pure integer columns: exact comparison, no tolerance
+ if a_is_int && b_is_int {
+ return val_a == val_b;
+ }
+ // At least one float, or cross-type int/float: apply tolerance
+ if a_is_float || b_is_float {
+ return (a - b).abs() <= tolerance;
+ }
+ }
+ // Fallback: exact string comparison
+ val_a == val_b
+}
+```
+
+- [ ] **Step 4: Run tests to verify they pass**
+
+Run: `cargo test --lib diff::tests`
+Expected: all 7 tests PASS
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/diff.rs
+git commit -m "feat(xldiff): implement key-based diff with composite keys and duplicate detection"
+```
+
+---
+
+### Task 7: diff.rs — tolerance tests
+
+**Files:**
+- Modify: `src/diff.rs`
+
+- [ ] **Step 1: Write tolerance tests**
+
+Add to the `tests` module:
+
+```rust
+ #[test]
+ fn test_keyed_tolerance_within() {
+ let df_a = DataFrame::new(vec![
+ Series::new("id".into(), &["1"]).into_column(),
+ Series::new("price".into(), &[100.001f64]).into_column(),
+ ]).unwrap();
+ let df_b = DataFrame::new(vec![
+ Series::new("id".into(), &["1"]).into_column(),
+ Series::new("price".into(), &[100.002f64]).into_column(),
+ ]).unwrap();
+
+ let opts = DiffOptions {
+ key_columns: vec!["id".to_string()],
+ tolerance: Some(0.01),
+ };
+ let result = diff_keyed(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap();
+ assert!(!result.has_differences(), "values within tolerance should be equal");
+ }
+
+ #[test]
+ fn test_keyed_tolerance_exceeded() {
+ let df_a = DataFrame::new(vec![
+ Series::new("id".into(), &["1"]).into_column(),
+ Series::new("price".into(), &[100.0f64]).into_column(),
+ ]).unwrap();
+ let df_b = DataFrame::new(vec![
+ Series::new("id".into(), &["1"]).into_column(),
+ Series::new("price".into(), &[100.05f64]).into_column(),
+ ]).unwrap();
+
+ let opts = DiffOptions {
+ key_columns: vec!["id".to_string()],
+ tolerance: Some(0.01),
+ };
+ let result = diff_keyed(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap();
+ assert_eq!(result.modified.len(), 1);
+ }
+
+ #[test]
+ fn test_keyed_nan_handling() {
+ let df_a = DataFrame::new(vec![
+ Series::new("id".into(), &["1"]).into_column(),
+ Series::new("val".into(), &[f64::NAN]).into_column(),
+ ]).unwrap();
+ let df_b = DataFrame::new(vec![
+ Series::new("id".into(), &["1"]).into_column(),
+ Series::new("val".into(), &[f64::NAN]).into_column(),
+ ]).unwrap();
+
+ let opts = DiffOptions {
+ key_columns: vec!["id".to_string()],
+ tolerance: Some(0.01),
+ };
+ let result = diff_keyed(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap();
+ assert!(!result.has_differences(), "NaN == NaN should be equal");
+ }
+```
+
+- [ ] **Step 2: Run tests**
+
+Run: `cargo test --lib diff::tests`
+Expected: all 10 tests PASS
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add src/diff.rs
+git commit -m "test(xldiff): add tolerance and NaN handling tests"
+```
+
+---
+
+### Task 8: diff.rs — public entry point `diff_sheets`
+
+**Files:**
+- Modify: `src/diff.rs`
+
+- [ ] **Step 1: Write test for the unified entry point**
+
+Add to tests:
+
+```rust
+ #[test]
+ fn test_diff_sheets_positional() {
+ let df_a = DataFrame::new(vec![
+ Series::new("x".into(), &["a", "b"]).into_column(),
+ ]).unwrap();
+ let df_b = DataFrame::new(vec![
+ Series::new("x".into(), &["a", "c"]).into_column(),
+ ]).unwrap();
+
+ let opts = DiffOptions::default(); // no key → positional
+ let result = diff_sheets(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap();
+ assert_eq!(result.removed.len(), 1);
+ assert_eq!(result.added.len(), 1);
+ }
+
+ #[test]
+ fn test_diff_sheets_keyed() {
+ let df_a = DataFrame::new(vec![
+ Series::new("id".into(), &["1"]).into_column(),
+ Series::new("v".into(), &[10i64]).into_column(),
+ ]).unwrap();
+ let df_b = DataFrame::new(vec![
+ Series::new("id".into(), &["1"]).into_column(),
+ Series::new("v".into(), &[20i64]).into_column(),
+ ]).unwrap();
+
+ let opts = DiffOptions { key_columns: vec!["id".to_string()], ..Default::default() };
+ let result = diff_sheets(&df_a, &df_b, &opts, make_source("a"), make_source("b")).unwrap();
+ assert_eq!(result.modified.len(), 1);
+ }
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run: `cargo test --lib diff::tests::test_diff_sheets`
+Expected: FAIL
+
+- [ ] **Step 3: Implement `diff_sheets`**
+
+Add to `src/diff.rs`:
+
+```rust
+/// Compare two DataFrames. Dispatches to positional or key-based mode.
+pub fn diff_sheets(
+ df_a: &DataFrame,
+ df_b: &DataFrame,
+ opts: &DiffOptions,
+ source_a: SheetSource,
+ source_b: SheetSource,
+) -> anyhow::Result<DiffResult> {
+ if opts.key_columns.is_empty() {
+ diff_positional(df_a, df_b, opts, source_a, source_b)
+ } else {
+ diff_keyed(df_a, df_b, opts, source_a, source_b)
+ }
+}
+```
+
+- [ ] **Step 4: Run tests**
+
+Run: `cargo test --lib diff::tests`
+Expected: all 12 tests PASS
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/diff.rs
+git commit -m "feat(xldiff): add diff_sheets entry point dispatching positional/keyed modes"
+```
+
+---
+
+### Task 9: xldiff.rs — CLI skeleton + file:sheet parsing
+
+**Files:**
+- Create: `src/bin/xldiff.rs`
+
+- [ ] **Step 1: Write the CLI binary with clap args and file:sheet parsing**
+
+Create `src/bin/xldiff.rs`:
+
+```rust
+use std::io::IsTerminal;
+use std::path::PathBuf;
+use std::process;
+
+use anyhow::Result;
+use clap::Parser;
+
+use xlcat::diff::{self, DiffOptions, DiffResult, SheetSource};
+use xlcat::filter;
+use xlcat::metadata;
+use xlcat::reader;
+
+#[derive(Parser)]
+#[command(
+ name = "xldiff",
+ about = "Compare two Excel sheets and report differences",
+ version
+)]
+struct Args {
+ /// First file, optionally with :sheet suffix (e.g., old.xlsx:Sales)
+ file_a: String,
+
+ /// Second file, optionally with :sheet suffix
+ file_b: String,
+
+ /// Comma-separated key columns for row matching (name or column letter).
+ /// Omit for positional (whole-row) comparison.
+ #[arg(long)]
+ key: Option<String>,
+
+ /// Comma-separated columns to compare (name or column letter).
+ /// Default: all columns. Key columns are always included.
+ #[arg(long)]
+ cols: Option<String>,
+
+ /// Rows to skip before header. Single value for both files, or N,M for independent skips.
+ #[arg(long, default_value = "0")]
+ skip: String,
+
+ /// Treat first row as data; generate synthetic column_0, column_1, … headers.
+ /// NOTE: declared for CLI compatibility but not yet implemented — follow-up task.
+ #[arg(long)]
+ no_header: bool,
+
+ /// Numeric tolerance for float comparisons (e.g., 0.01)
+ #[arg(long)]
+ tolerance: Option<f64>,
+
+ /// Output format: text (default), markdown, json, csv
+ #[arg(long, default_value = "text")]
+ format: String,
+
+ /// Force-disable ANSI colors
+ #[arg(long)]
+ no_color: bool,
+}
+
+/// Parse a file argument like "path.xlsx:Sheet1" into (path, Option<sheet>).
+/// Uses rfind(':') to handle Windows drive letters (C:\...).
+fn parse_file_arg(arg: &str) -> (PathBuf, Option<String>) {
+ if let Some(colon_pos) = arg.rfind(':') {
+ // Don't split if the colon is part of a Windows drive letter (e.g., C:\)
+ // A drive letter colon is at position 1 and followed by \ or /
+ if colon_pos == 1 && arg.len() > 2 && (arg.as_bytes()[2] == b'\\' || arg.as_bytes()[2] == b'/') {
+ return (PathBuf::from(arg), None);
+ }
+ // Don't split if nothing after the colon
+ if colon_pos == arg.len() - 1 {
+ return (PathBuf::from(&arg[..colon_pos]), None);
+ }
+ let path = PathBuf::from(&arg[..colon_pos]);
+ let sheet = arg[colon_pos + 1..].to_string();
+ (path, Some(sheet))
+ } else {
+ (PathBuf::from(arg), None)
+ }
+}
+
+/// Parse the --skip flag: "3" or "3,5"
+fn parse_skip(s: &str) -> Result<(usize, usize)> {
+ if let Some((a, b)) = s.split_once(',') {
+ let skip_a = a.trim().parse::<usize>().map_err(|_| anyhow::anyhow!("invalid skip value: {}", a))?;
+ let skip_b = b.trim().parse::<usize>().map_err(|_| anyhow::anyhow!("invalid skip value: {}", b))?;
+ Ok((skip_a, skip_b))
+ } else {
+ let skip = s.trim().parse::<usize>().map_err(|_| anyhow::anyhow!("invalid skip value: {}", s))?;
+ Ok((skip, skip))
+ }
+}
+
+/// Resolve sheet name from user input or default to first sheet.
+fn resolve_sheet(info: &metadata::FileInfo, sheet_arg: Option<&str>) -> Result<String> {
+ match sheet_arg {
+ None => info.sheets.first()
+ .map(|s| s.name.clone())
+ .ok_or_else(|| anyhow::anyhow!("workbook has no sheets")),
+ Some(s) => {
+ // Try exact name match
+ if let Some(sheet) = info.sheets.iter().find(|si| si.name == s) {
+ return Ok(sheet.name.clone());
+ }
+ // Try as 0-based index
+ if let Ok(idx) = s.parse::<usize>() {
+ if let Some(sheet) = info.sheets.get(idx) {
+ return Ok(sheet.name.clone());
+ }
+ }
+ let names: Vec<_> = info.sheets.iter().map(|s| s.name.as_str()).collect();
+ anyhow::bail!("sheet '{}' not found. Available: {}", s, names.join(", "))
+ }
+ }
+}
+
+fn run(args: Args) -> Result<()> {
+ let (path_a, sheet_sel_a) = parse_file_arg(&args.file_a);
+ let (path_b, sheet_sel_b) = parse_file_arg(&args.file_b);
+
+ // Validate files exist
+ if !path_a.exists() {
+ anyhow::bail!("file not found: {}", path_a.display());
+ }
+ if !path_b.exists() {
+ anyhow::bail!("file not found: {}", path_b.display());
+ }
+
+ // Validate format
+ let format = match args.format.as_str() {
+ "text" | "markdown" | "json" | "csv" => args.format.as_str(),
+ other => anyhow::bail!("unknown format '{}'. Use: text, markdown, json, csv", other),
+ };
+
+ // Parse skip
+ let (skip_a, skip_b) = parse_skip(&args.skip)?;
+
+ // Resolve sheets
+ let info_a = metadata::read_file_info(&path_a)?;
+ let info_b = metadata::read_file_info(&path_b)?;
+ let sheet_a = resolve_sheet(&info_a, sheet_sel_a.as_deref())?;
+ let sheet_b = resolve_sheet(&info_b, sheet_sel_b.as_deref())?;
+
+ // Read DataFrames
+ let df_a = if skip_a > 0 {
+ reader::read_sheet_with_skip(&path_a, &sheet_a, skip_a)?
+ } else {
+ reader::read_sheet(&path_a, &sheet_a)?
+ };
+ let df_b = if skip_b > 0 {
+ reader::read_sheet_with_skip(&path_b, &sheet_b, skip_b)?
+ } else {
+ reader::read_sheet(&path_b, &sheet_b)?
+ };
+
+ // Resolve key columns (validate they exist in both files)
+ let key_columns: Vec<String> = if let Some(ref key_str) = args.key {
+ let specs: Vec<String> = key_str.split(',').map(|s| s.trim().to_string()).collect();
+ let cols_a: Vec<String> = df_a.get_column_names().iter().map(|s| s.to_string()).collect();
+ let cols_b: Vec<String> = df_b.get_column_names().iter().map(|s| s.to_string()).collect();
+ let resolved = filter::resolve_columns(&specs, &cols_a)
+ .map_err(|e| anyhow::anyhow!("first file: {}", e))?;
+ // Validate key columns also exist in second file
+ for k in &resolved {
+ if !cols_b.iter().any(|c| c == k) {
+ anyhow::bail!("key column '{}' not found in second file", k);
+ }
+ }
+ resolved
+ } else {
+ vec![]
+ };
+
+ // Column filtering
+ let (df_a, df_b) = if let Some(ref cols_str) = args.cols {
+ let specs: Vec<String> = cols_str.split(',').map(|s| s.trim().to_string()).collect();
+ let cols_a: Vec<String> = df_a.get_column_names().iter().map(|s| s.to_string()).collect();
+ let mut selected = filter::resolve_columns(&specs, &cols_a)
+ .map_err(|e| anyhow::anyhow!(e))?;
+ // Ensure key columns are included
+ for k in &key_columns {
+ if !selected.contains(k) {
+ selected.insert(0, k.clone());
+ }
+ }
+ let sel_refs: Vec<&str> = selected.iter().map(|s| s.as_str()).collect();
+ let df_a = df_a.select(&sel_refs)?;
+ let df_b = df_b.select(&sel_refs)?;
+ (df_a, df_b)
+ } else {
+ (df_a, df_b)
+ };
+
+ let source_a = SheetSource {
+ file_name: path_a.file_name().unwrap_or_default().to_string_lossy().to_string(),
+ sheet_name: sheet_a,
+ };
+ let source_b = SheetSource {
+ file_name: path_b.file_name().unwrap_or_default().to_string_lossy().to_string(),
+ sheet_name: sheet_b,
+ };
+
+ let opts = DiffOptions {
+ key_columns,
+ tolerance: args.tolerance,
+ };
+
+ let result = diff::diff_sheets(&df_a, &df_b, &opts, source_a, source_b)?;
+
+ // Determine color
+ let use_color = !args.no_color && std::io::stdout().is_terminal();
+
+ // Format and print
+ let output = match format {
+ "text" => format_text(&result, use_color),
+ "markdown" => format_markdown(&result),
+ "json" => format_json(&result),
+ "csv" => format_csv(&result),
+ _ => unreachable!(),
+ };
+ print!("{output}");
+
+ // Exit code
+ if result.has_differences() {
+ process::exit(1);
+ }
+ Ok(())
+}
+
+fn main() {
+ let args = Args::parse();
+ if let Err(err) = run(args) {
+ eprintln!("xldiff: {err}");
+ process::exit(2);
+ }
+}
+
+// Output formatters — implemented in subsequent tasks
+fn format_text(_result: &DiffResult, _color: bool) -> String { todo!() }
+fn format_markdown(_result: &DiffResult) -> String { todo!() }
+fn format_json(_result: &DiffResult) -> String { todo!() }
+fn format_csv(_result: &DiffResult) -> String { todo!() }
+```
+
+- [ ] **Step 2: Verify it compiles**
+
+Run: `cargo check --bin xldiff`
+Expected: success (todo!() stubs compile but will panic at runtime)
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add src/bin/xldiff.rs
+git commit -m "feat(xldiff): add CLI skeleton with file:sheet parsing, skip, key/cols resolution"
+```
+
+---
+
+### Task 10: xldiff.rs — text output formatter
+
+**Files:**
+- Modify: `src/bin/xldiff.rs`
+
+- [ ] **Step 1: Implement `format_text`**
+
+Replace the `format_text` stub:
+
+```rust
+fn format_text(result: &DiffResult, color: bool) -> String {
+ if !result.has_differences() {
+ return "No differences found.\n".to_string();
+ }
+
+ let mut out = String::new();
+
+ // Header
+ let (red, green, yellow, reset) = if color {
+ ("\x1b[31m", "\x1b[32m", "\x1b[33m", "\x1b[0m")
+ } else {
+ ("", "", "", "")
+ };
+
+ out.push_str(&format!(
+ "--- {} ({})\n+++ {} ({})\n\n",
+ result.source_a.sheet_name, result.source_a.file_name,
+ result.source_b.sheet_name, result.source_b.file_name,
+ ));
+
+ // Summary
+ out.push_str(&format!(
+ "Added: {} | Removed: {} | Modified: {}\n\n",
+ result.added.len(),
+ result.removed.len(),
+ result.modified.len(),
+ ));
+
+ // Removed rows
+ for row in &result.removed {
+ out.push_str(red);
+ out.push_str("- ");
+ out.push_str(&format_row_inline(&result.headers, &row.values));
+ out.push_str(reset);
+ out.push('\n');
+ }
+
+ // Added rows
+ for row in &result.added {
+ out.push_str(green);
+ out.push_str("+ ");
+ out.push_str(&format_row_inline(&result.headers, &row.values));
+ out.push_str(reset);
+ out.push('\n');
+ }
+
+ if (!result.removed.is_empty() || !result.added.is_empty()) && !result.modified.is_empty() {
+ out.push('\n');
+ }
+
+ // Modified rows
+ for m in &result.modified {
+ out.push_str(yellow);
+ out.push_str("~ ");
+ // Show key
+ let key_display: Vec<String> = result.key_columns.iter()
+ .zip(m.key.iter())
+ .map(|(col, val)| format!("{}: \"{}\"", col, val))
+ .collect();
+ out.push_str(&key_display.join(" "));
+ out.push_str(reset);
+ out.push('\n');
+ for change in &m.changes {
+ out.push_str(yellow);
+ out.push_str(&format!(" {}: \"{}\" → \"{}\"", change.column, change.old_value, change.new_value));
+ out.push_str(reset);
+ out.push('\n');
+ }
+ }
+
+ out
+}
+
+/// Format a row as inline key-value pairs: Name: "Alice" Score: "90"
+fn format_row_inline(headers: &[String], values: &[String]) -> String {
+ headers.iter()
+ .zip(values.iter())
+ .map(|(h, v)| format!("{}: \"{}\"", h, v))
+ .collect::<Vec<_>>()
+ .join(" ")
+}
+```
+
+- [ ] **Step 2: Verify it compiles**
+
+Run: `cargo check --bin xldiff`
+Expected: success
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add src/bin/xldiff.rs
+git commit -m "feat(xldiff): implement colored text output formatter"
+```
+
+---
+
+### Task 11: xldiff.rs — markdown output formatter
+
+**Files:**
+- Modify: `src/bin/xldiff.rs`
+
+- [ ] **Step 1: Implement `format_markdown`**
+
+Replace the `format_markdown` stub:
+
+```rust
+fn format_markdown(result: &DiffResult) -> String {
+ use xlcat::formatter;
+
+ if !result.has_differences() {
+ return "No differences found.\n".to_string();
+ }
+
+ let mut out = String::new();
+
+ // Added
+ if !result.added.is_empty() {
+ out.push_str(&format!("## Added ({})\n", result.added.len()));
+ let rows: Vec<Vec<String>> = result.added.iter().map(|r| r.values.clone()).collect();
+ out.push_str(&formatter::render_table(&result.headers, &rows));
+ out.push('\n');
+ }
+
+ // Removed
+ if !result.removed.is_empty() {
+ out.push_str(&format!("## Removed ({})\n", result.removed.len()));
+ let rows: Vec<Vec<String>> = result.removed.iter().map(|r| r.values.clone()).collect();
+ out.push_str(&formatter::render_table(&result.headers, &rows));
+ out.push('\n');
+ }
+
+ // Modified
+ if !result.modified.is_empty() {
+ out.push_str(&format!("## Modified ({})\n", result.modified.len()));
+ let key_label = if result.key_columns.len() == 1 {
+ format!("Key ({})", result.key_columns[0])
+ } else {
+ format!("Key ({})", result.key_columns.join(", "))
+ };
+ let mod_headers = vec![key_label, "Column".to_string(), "Old".to_string(), "New".to_string()];
+ let mut mod_rows = Vec::new();
+ for m in &result.modified {
+ let key_str = m.key.join(", ");
+ for (i, change) in m.changes.iter().enumerate() {
+ let key_cell = if i == 0 { key_str.clone() } else { String::new() };
+ mod_rows.push(vec![
+ key_cell,
+ change.column.clone(),
+ change.old_value.clone(),
+ change.new_value.clone(),
+ ]);
+ }
+ }
+ out.push_str(&formatter::render_table(&mod_headers, &mod_rows));
+ out.push('\n');
+ }
+
+ out
+}
+```
+
+- [ ] **Step 2: Verify it compiles**
+
+Run: `cargo check --bin xldiff`
+Expected: success
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add src/bin/xldiff.rs
+git commit -m "feat(xldiff): implement markdown output formatter"
+```
+
+---
+
+### Task 12: xldiff.rs — JSON output formatter
+
+**Files:**
+- Modify: `src/bin/xldiff.rs`
+
+- [ ] **Step 1: Implement `format_json`**
+
+Replace the `format_json` stub:
+
+```rust
+fn format_json(result: &DiffResult) -> String {
+ use serde_json::{json, Map, Value};
+
+ let row_to_obj = |row: &diff::DiffRow| -> Value {
+ let mut map = Map::new();
+ for (h, v) in result.headers.iter().zip(row.values.iter()) {
+ map.insert(h.clone(), Value::String(v.clone()));
+ }
+ Value::Object(map)
+ };
+
+ let added: Vec<Value> = result.added.iter().map(row_to_obj).collect();
+ let removed: Vec<Value> = result.removed.iter().map(row_to_obj).collect();
+
+ let modified: Vec<Value> = result.modified.iter().map(|m| {
+ let mut key_map = Map::new();
+ for (col, val) in result.key_columns.iter().zip(m.key.iter()) {
+ key_map.insert(col.clone(), Value::String(val.clone()));
+ }
+ let changes: Vec<Value> = m.changes.iter().map(|c| {
+ json!({
+ "column": c.column,
+ "old": c.old_value,
+ "new": c.new_value,
+ })
+ }).collect();
+ json!({
+ "key": Value::Object(key_map),
+ "changes": changes,
+ })
+ }).collect();
+
+ let obj = json!({
+ "added": added,
+ "removed": removed,
+ "modified": modified,
+ });
+
+ serde_json::to_string_pretty(&obj).unwrap() + "\n"
+}
+```
+
+- [ ] **Step 2: Verify it compiles**
+
+Run: `cargo check --bin xldiff`
+Expected: success
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add src/bin/xldiff.rs
+git commit -m "feat(xldiff): implement JSON output formatter"
+```
+
+---
+
+### Task 13: xldiff.rs — CSV output formatter
+
+**Files:**
+- Modify: `src/bin/xldiff.rs`
+
+- [ ] **Step 1: Implement `format_csv`**
+
+Replace the `format_csv` stub:
+
+```rust
+fn format_csv(result: &DiffResult) -> String {
+ let mut out = String::new();
+
+ // Header: _status, col1, col2, ..., _old_col1, _old_col2, ...
+ let mut header_parts = vec!["_status".to_string()];
+ header_parts.extend(result.headers.clone());
+ for h in &result.headers {
+ header_parts.push(format!("_old_{}", h));
+ }
+ out.push_str(&csv_row(&header_parts));
+
+ // Added rows
+ for row in &result.added {
+ let mut parts = vec!["added".to_string()];
+ parts.extend(row.values.clone());
+ // Empty _old_ columns
+ for _ in &result.headers {
+ parts.push(String::new());
+ }
+ out.push_str(&csv_row(&parts));
+ }
+
+ // Removed rows
+ for row in &result.removed {
+ let mut parts = vec!["removed".to_string()];
+ parts.extend(row.values.clone());
+ // Empty _old_ columns
+ for _ in &result.headers {
+ parts.push(String::new());
+ }
+ out.push_str(&csv_row(&parts));
+ }
+
+ // Modified rows
+ for m in &result.modified {
+ let mut main_vals = vec![String::new(); result.headers.len()];
+ let mut old_vals = vec![String::new(); result.headers.len()];
+
+ // Fill key columns in main values
+ for (key_col, key_val) in result.key_columns.iter().zip(m.key.iter()) {
+ if let Some(idx) = result.headers.iter().position(|h| h == key_col) {
+ main_vals[idx] = key_val.clone();
+ }
+ }
+
+ // Fill changed columns
+ for change in &m.changes {
+ if let Some(idx) = result.headers.iter().position(|h| h == &change.column) {
+ main_vals[idx] = change.new_value.clone();
+ old_vals[idx] = change.old_value.clone();
+ }
+ }
+
+ let mut parts = vec!["modified".to_string()];
+ parts.extend(main_vals);
+ parts.extend(old_vals);
+ out.push_str(&csv_row(&parts));
+ }
+
+ out
+}
+
+/// Format a single CSV row with RFC 4180 quoting.
+fn csv_row(values: &[String]) -> String {
+ let escaped: Vec<String> = values.iter().map(|v| {
+ if v.contains(',') || v.contains('"') || v.contains('\n') {
+ format!("\"{}\"", v.replace('"', "\"\""))
+ } else {
+ v.clone()
+ }
+ }).collect();
+ escaped.join(",") + "\n"
+}
+```
+
+- [ ] **Step 2: Verify it compiles**
+
+Run: `cargo check --bin xldiff`
+Expected: success
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add src/bin/xldiff.rs
+git commit -m "feat(xldiff): implement CSV output formatter"
+```
+
+---
+
+### Task 14: Test fixtures
+
+**Files:**
+- Modify: `tests/common/mod.rs`
+
+- [ ] **Step 1: Add diff test fixtures**
+
+Append to `tests/common/mod.rs`:
+
+```rust
+/// Create a pair of files for positional diff testing.
+/// File A: Alice/90, Bob/80, Charlie/70
+/// File B: Alice/90, Charlie/70, Dana/85 (Bob removed, Dana added)
+pub fn create_diff_pair(path_a: &Path, path_b: &Path) {
+ let mut wb = Workbook::new();
+ let ws = wb.add_worksheet().set_name("Data").unwrap();
+ ws.write_string(0, 0, "Name").unwrap();
+ ws.write_string(0, 1, "Score").unwrap();
+ ws.write_string(1, 0, "Alice").unwrap();
+ ws.write_number(1, 1, 90.0).unwrap();
+ ws.write_string(2, 0, "Bob").unwrap();
+ ws.write_number(2, 1, 80.0).unwrap();
+ ws.write_string(3, 0, "Charlie").unwrap();
+ ws.write_number(3, 1, 70.0).unwrap();
+ wb.save(path_a).unwrap();
+
+ let mut wb = Workbook::new();
+ let ws = wb.add_worksheet().set_name("Data").unwrap();
+ ws.write_string(0, 0, "Name").unwrap();
+ ws.write_string(0, 1, "Score").unwrap();
+ ws.write_string(1, 0, "Alice").unwrap();
+ ws.write_number(1, 1, 90.0).unwrap();
+ ws.write_string(2, 0, "Charlie").unwrap();
+ ws.write_number(2, 1, 70.0).unwrap();
+ ws.write_string(3, 0, "Dana").unwrap();
+ ws.write_number(3, 1, 85.0).unwrap();
+ wb.save(path_b).unwrap();
+}
+
+/// Create a pair for key-based diff testing.
+/// File A: ID=1/Score=90, ID=2/Score=80, ID=3/Score=70
+/// File B: ID=1/Score=95, ID=2/Score=80, ID=4/Score=85
+/// Expected: ID=1 modified (90→95), ID=3 removed, ID=4 added
+pub fn create_diff_pair_with_keys(path_a: &Path, path_b: &Path) {
+ let mut wb = Workbook::new();
+ let ws = wb.add_worksheet().set_name("Data").unwrap();
+ ws.write_string(0, 0, "ID").unwrap();
+ ws.write_string(0, 1, "Name").unwrap();
+ ws.write_string(0, 2, "Score").unwrap();
+ ws.write_string(1, 0, "1").unwrap();
+ ws.write_string(1, 1, "Alice").unwrap();
+ ws.write_number(1, 2, 90.0).unwrap();
+ ws.write_string(2, 0, "2").unwrap();
+ ws.write_string(2, 1, "Bob").unwrap();
+ ws.write_number(2, 2, 80.0).unwrap();
+ ws.write_string(3, 0, "3").unwrap();
+ ws.write_string(3, 1, "Charlie").unwrap();
+ ws.write_number(3, 2, 70.0).unwrap();
+ wb.save(path_a).unwrap();
+
+ let mut wb = Workbook::new();
+ let ws = wb.add_worksheet().set_name("Data").unwrap();
+ ws.write_string(0, 0, "ID").unwrap();
+ ws.write_string(0, 1, "Name").unwrap();
+ ws.write_string(0, 2, "Score").unwrap();
+ ws.write_string(1, 0, "1").unwrap();
+ ws.write_string(1, 1, "Alice").unwrap();
+ ws.write_number(1, 2, 95.0).unwrap();
+ ws.write_string(2, 0, "2").unwrap();
+ ws.write_string(2, 1, "Bob").unwrap();
+ ws.write_number(2, 2, 80.0).unwrap();
+ ws.write_string(3, 0, "4").unwrap();
+ ws.write_string(3, 1, "Dana").unwrap();
+ ws.write_number(3, 2, 85.0).unwrap();
+ wb.save(path_b).unwrap();
+}
+
+/// Create a pair for tolerance testing.
+/// File A: ID=1/Price=100.001, ID=2/Price=200.5
+/// File B: ID=1/Price=100.002, ID=2/Price=200.6
+/// With tolerance 0.01: only ID=2 should be modified (diff=0.1 > 0.01)
+pub fn create_diff_pair_with_floats(path_a: &Path, path_b: &Path) {
+ let mut wb = Workbook::new();
+ let ws = wb.add_worksheet().set_name("Data").unwrap();
+ ws.write_string(0, 0, "ID").unwrap();
+ ws.write_string(0, 1, "Price").unwrap();
+ ws.write_string(1, 0, "1").unwrap();
+ ws.write_number(1, 1, 100.001).unwrap();
+ ws.write_string(2, 0, "2").unwrap();
+ ws.write_number(2, 1, 200.5).unwrap();
+ wb.save(path_a).unwrap();
+
+ let mut wb = Workbook::new();
+ let ws = wb.add_worksheet().set_name("Data").unwrap();
+ ws.write_string(0, 0, "ID").unwrap();
+ ws.write_string(0, 1, "Price").unwrap();
+ ws.write_string(1, 0, "1").unwrap();
+ ws.write_number(1, 1, 100.002).unwrap();
+ ws.write_string(2, 0, "2").unwrap();
+ ws.write_number(2, 1, 200.6).unwrap();
+ wb.save(path_b).unwrap();
+}
+```
+
+- [ ] **Step 2: Verify compilation**
+
+Run: `cargo test --no-run`
+Expected: compiles successfully
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add tests/common/mod.rs
+git commit -m "test(xldiff): add diff test fixtures — positional, keyed, and float tolerance pairs"
+```
+
+---
+
+### Task 15: Integration tests
+
+**Files:**
+- Create: `tests/test_xldiff.rs`
+
+- [ ] **Step 1: Write integration tests**
+
+Create `tests/test_xldiff.rs`:
+
+```rust
+mod common;
+
+use assert_cmd::Command;
+use predicates::prelude::*;
+use tempfile::TempDir;
+
+#[test]
+fn test_no_diff() {
+ let dir = TempDir::new().unwrap();
+ let path_a = dir.path().join("a.xlsx");
+ let path_b = dir.path().join("b.xlsx");
+ // Use same data for both
+ common::create_simple(&path_a);
+ common::create_simple(&path_b);
+
+ Command::cargo_bin("xldiff").unwrap()
+ .arg(path_a.to_str().unwrap())
+ .arg(path_b.to_str().unwrap())
+ .arg("--no-color")
+ .assert()
+ .success() // exit 0
+ .stdout(predicate::str::contains("No differences found."));
+}
+
+#[test]
+fn test_positional_diff() {
+ let dir = TempDir::new().unwrap();
+ let path_a = dir.path().join("a.xlsx");
+ let path_b = dir.path().join("b.xlsx");
+ common::create_diff_pair(&path_a, &path_b);
+
+ Command::cargo_bin("xldiff").unwrap()
+ .arg(path_a.to_str().unwrap())
+ .arg(path_b.to_str().unwrap())
+ .arg("--no-color")
+ .assert()
+ .code(1) // differences found
+ .stdout(predicate::str::contains("Removed: 1"))
+ .stdout(predicate::str::contains("Added: 1"))
+ .stdout(predicate::str::contains("Bob"));
+}
+
+#[test]
+fn test_keyed_diff() {
+ let dir = TempDir::new().unwrap();
+ let path_a = dir.path().join("a.xlsx");
+ let path_b = dir.path().join("b.xlsx");
+ common::create_diff_pair_with_keys(&path_a, &path_b);
+
+ Command::cargo_bin("xldiff").unwrap()
+ .arg(path_a.to_str().unwrap())
+ .arg(path_b.to_str().unwrap())
+ .arg("--key").arg("ID")
+ .arg("--no-color")
+ .assert()
+ .code(1)
+ .stdout(predicate::str::contains("Modified: 1"))
+ .stdout(predicate::str::contains("Removed: 1"))
+ .stdout(predicate::str::contains("Added: 1"));
+}
+
+#[test]
+fn test_tolerance() {
+ let dir = TempDir::new().unwrap();
+ let path_a = dir.path().join("a.xlsx");
+ let path_b = dir.path().join("b.xlsx");
+ common::create_diff_pair_with_floats(&path_a, &path_b);
+
+ // With tolerance=0.01, ID=1 diff (0.001) within tolerance, ID=2 diff (0.1) exceeds
+ Command::cargo_bin("xldiff").unwrap()
+ .arg(path_a.to_str().unwrap())
+ .arg(path_b.to_str().unwrap())
+ .arg("--key").arg("ID")
+ .arg("--tolerance").arg("0.01")
+ .arg("--no-color")
+ .assert()
+ .code(1)
+ .stdout(predicate::str::contains("Modified: 1"));
+}
+
+#[test]
+fn test_json_format() {
+ let dir = TempDir::new().unwrap();
+ let path_a = dir.path().join("a.xlsx");
+ let path_b = dir.path().join("b.xlsx");
+ common::create_diff_pair_with_keys(&path_a, &path_b);
+
+ Command::cargo_bin("xldiff").unwrap()
+ .arg(path_a.to_str().unwrap())
+ .arg(path_b.to_str().unwrap())
+ .arg("--key").arg("ID")
+ .arg("--format").arg("json")
+ .assert()
+ .code(1)
+ .stdout(predicate::str::contains("\"added\""))
+ .stdout(predicate::str::contains("\"removed\""))
+ .stdout(predicate::str::contains("\"modified\""));
+}
+
+#[test]
+fn test_markdown_format() {
+ let dir = TempDir::new().unwrap();
+ let path_a = dir.path().join("a.xlsx");
+ let path_b = dir.path().join("b.xlsx");
+ common::create_diff_pair_with_keys(&path_a, &path_b);
+
+ Command::cargo_bin("xldiff").unwrap()
+ .arg(path_a.to_str().unwrap())
+ .arg(path_b.to_str().unwrap())
+ .arg("--key").arg("ID")
+ .arg("--format").arg("markdown")
+ .assert()
+ .code(1)
+ .stdout(predicate::str::contains("## Added"))
+ .stdout(predicate::str::contains("## Removed"))
+ .stdout(predicate::str::contains("## Modified"));
+}
+
+#[test]
+fn test_csv_format() {
+ let dir = TempDir::new().unwrap();
+ let path_a = dir.path().join("a.xlsx");
+ let path_b = dir.path().join("b.xlsx");
+ common::create_diff_pair_with_keys(&path_a, &path_b);
+
+ Command::cargo_bin("xldiff").unwrap()
+ .arg(path_a.to_str().unwrap())
+ .arg(path_b.to_str().unwrap())
+ .arg("--key").arg("ID")
+ .arg("--format").arg("csv")
+ .assert()
+ .code(1)
+ .stdout(predicate::str::contains("_status"))
+ .stdout(predicate::str::contains("added"))
+ .stdout(predicate::str::contains("removed"))
+ .stdout(predicate::str::contains("modified"));
+}
+
+#[test]
+fn test_file_not_found() {
+ Command::cargo_bin("xldiff").unwrap()
+ .arg("nonexistent.xlsx")
+ .arg("also_nonexistent.xlsx")
+ .assert()
+ .code(2)
+ .stderr(predicate::str::contains("file not found"));
+}
+
+#[test]
+fn test_sheet_selector() {
+ let dir = TempDir::new().unwrap();
+ let path = dir.path().join("multi.xlsx");
+ common::create_multi_sheet(&path);
+ let path_str = path.to_str().unwrap();
+
+ // Compare Revenue vs Expenses sheets within same file
+ Command::cargo_bin("xldiff").unwrap()
+ .arg(format!("{}:Revenue", path_str))
+ .arg(format!("{}:Expenses", path_str))
+ .arg("--no-color")
+ .assert()
+ .code(1); // different data → exit 1
+}
+
+#[test]
+fn test_cols_filter() {
+ let dir = TempDir::new().unwrap();
+ let path_a = dir.path().join("a.xlsx");
+ let path_b = dir.path().join("b.xlsx");
+ common::create_diff_pair_with_keys(&path_a, &path_b);
+
+ // Only compare Name column (ignore Score), so ID=1 shows no modification
+ Command::cargo_bin("xldiff").unwrap()
+ .arg(path_a.to_str().unwrap())
+ .arg(path_b.to_str().unwrap())
+ .arg("--key").arg("ID")
+ .arg("--cols").arg("ID,Name")
+ .arg("--no-color")
+ .assert()
+ .code(1)
+ // ID=1 has same Name in both, so no modification for that row
+ .stdout(predicate::str::contains("Modified: 0").or(predicate::str::contains("Modified").not()));
+}
+```
+
+- [ ] **Step 2: Run integration tests**
+
+Run: `cargo test --test test_xldiff`
+Expected: all tests PASS
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add tests/test_xldiff.rs
+git commit -m "test(xldiff): add integration tests — all modes, formats, tolerance, errors"
+```
+
+---
+
+### Task 16: Update CLAUDE.md and README
+
+**Files:**
+- Modify: `CLAUDE.md`
+
+- [ ] **Step 1: Update CLAUDE.md**
+
+In CLAUDE.md, update the "Current tools" section to include xldiff, and remove xldiff from the "Related tools" and "Planned" sections:
+- Add `- **xldiff** — compare two sheets, report added/removed/modified rows` to "Current tools"
+- Remove or update the "Related tools" xldiff entry
+- Remove xlfilter from "Planned" if still there (it's already built)
+- Update the Architecture line to mention the diff module
+
+- [ ] **Step 2: Commit**
+
+```bash
+git add CLAUDE.md
+git commit -m "docs: update CLAUDE.md — add xldiff to current tools, remove from planned"
+```
+
+---
+
+### Task 17: Final verification
+
+- [ ] **Step 1: Run all tests**
+
+Run: `cargo test`
+Expected: all tests pass (existing + new)
+
+- [ ] **Step 2: Run clippy**
+
+Run: `cargo clippy --all-targets`
+Expected: no warnings
+
+- [ ] **Step 3: Build release binary**
+
+Run: `cargo build --release --bin xldiff`
+Expected: binary created at `target/release/xldiff`
+
+- [ ] **Step 4: Manual smoke test**
+
+Run: `cargo run --bin xldiff -- demo/budget.xlsx demo/sales.xlsx --no-color`
+Expected: shows differences between the two demo files
+
+- [ ] **Step 5: Commit any fixes**
+
+If clippy or tests revealed issues, fix and commit.
diff --git a/docs/superpowers/specs/2026-03-17-xldiff-design.md b/docs/superpowers/specs/2026-03-17-xldiff-design.md
@@ -0,0 +1,248 @@
+# xldiff — Excel Sheet Comparison Tool
+
+Port of [go-xldiff](../../../go-xldiff) to Rust, leveraging polars for type-aware comparison.
+
+## CLI Interface
+
+```
+xldiff <FILE1>[:<SHEET>] <FILE2>[:<SHEET>] [flags]
+```
+
+Two positional file arguments, each optionally suffixed with `:SheetName` or `:0` (0-based index). The same file may appear twice with different sheets (e.g., `report.xlsx:Q1 report.xlsx:Q2`).
+
+Sheet selection uses `rfind(':')` to split on the last colon, so Windows drive-letter paths (e.g., `C:\file.xlsx`) are handled correctly. The suffix is tried as a sheet name first, then as a 0-based numeric index.
+
+### Flags
+
+| Flag | Type | Default | Description |
+|------|------|---------|-------------|
+| `--key <COLS>` | string | *(none)* | Comma-separated key columns (header name or column letter). Omit for positional mode. |
+| `--cols <COLS>` | string | *(all)* | Columns to compare (header name or column letter). Key columns always included. |
+| `--skip <N>[,<M>]` | string | `0` | Rows to skip before header. Single value applies to both files; comma pair sets each independently. |
+| `--no-header` | bool | false | Treat first row as data; generate synthetic `column_0`, `column_1`, … headers (matching reader.rs convention). |
+| `--tolerance <F>` | f64 | *(none)* | Numeric tolerance for float comparisons (e.g., `0.01`). |
+| `--format <FMT>` | string | `text` | Output format: `text`, `markdown`, `json`, `csv`. |
+| `--no-color` | bool | false | Force-disable ANSI colors (auto-detected by default). |
+
+**Column resolution priority** (consistent with `filter.rs::resolve_column()`):
+1. Exact header name match
+2. Case-insensitive header name match
+3. Column letter (A, B, AA) if purely alphabetic
+
+This means `--key A` matches a header named "A" before falling back to column position 0; a column letter selects by position only when no header has that (case-insensitive) name.
+
+### Exit Codes
+
+xldiff follows the `diff(1)` convention, which differs from xlcat/xlset/xlfilter:
+
+- **0** — no differences
+- **1** — differences found
+- **2** — error (invalid arguments or runtime failure)
+
+The other tools in this repo use exit 1 for runtime errors and exit 2 for argument errors. xldiff merges both error types into exit 2 because exit 1 is reserved for "differences found" — the standard convention for diff tools, enabling `if ! xldiff ...; then ...` in scripts.
+
+## Data Structures
+
+```rust
+pub struct DiffResult {
+ pub headers: Vec<String>, // Columns that were compared (filtered set if --cols used)
+ pub key_columns: Vec<String>,
+ pub added: Vec<DiffRow>,
+ pub removed: Vec<DiffRow>,
+ pub modified: Vec<ModifiedRow>,
+ pub source_a: SheetSource,
+ pub source_b: SheetSource,
+}
+
+pub struct SheetSource {
+ pub file_name: String,
+ pub sheet_name: String,
+}
+
+pub struct DiffRow {
+ pub values: Vec<String>, // Cell values as display strings
+}
+
+pub struct ModifiedRow {
+ pub key: Vec<String>,
+ pub changes: Vec<CellChange>,
+}
+
+pub struct CellChange {
+ pub column: String,
+ pub old_value: String,
+ pub new_value: String,
+}
+```
+
+`DiffResult` is format-agnostic. Each output formatter consumes it independently. `SheetSource` carries file/sheet metadata for headers.
+
+Values are converted to display strings during diff construction. Type-aware comparison (tolerance, numeric equality) happens during the diff phase; the result stores only the string representations.
+
+`headers` reflects the effective column set after `--cols` filtering, so output formatters use it directly without further filtering.
+
+## Diff Algorithm
+
+### Mismatched Headers
+
+Before diffing, the tool checks whether both DataFrames share the same column names (after `--cols` filtering). Behavior:
+
+- **Key-based mode:** key columns must exist in both files or the tool exits with an error. Non-key columns are compared by name — columns present in only one file are reported as added/removed columns in a warning on stderr, and comparison proceeds on the intersection. `DiffResult.headers` contains the union.
+- **Positional mode:** columns are matched by position. If column counts differ, the tool warns on stderr and pads the shorter side with empty values.
+
+### Positional Mode (no `--key`)
+
+1. Convert each row to a string: join all compared columns with a null-byte separator.
+2. Build frequency maps (`HashMap<RowHash, usize>`) for both tables.
+3. Walk table A: if B has remaining copies, consume one; otherwise mark as removed.
+4. Walk table B: if A has remaining copies, consume one; otherwise mark as added.
+5. No "modified" rows — any cell change produces a different hash.
+
+### Key-Based Mode (`--key` provided)
+
+The implementation uses HashMap-based row matching (like the Go version) rather than polars joins. This avoids complications with join column type mismatches, row reordering, and duplicate-key cartesian products. Polars is used for reading and column selection, not for the join itself.
+
+1. Resolve key column names/letters to column indices.
+2. Build `HashMap<Vec<String>, RowData>` for both tables, keyed by the concatenated key column values.
+3. For each key in A not in B → removed.
+4. For each key in B not in A → added.
+5. For each key in both → compare non-key columns cell by cell:
+ - **Numeric (Float64):** if `--tolerance` is set, equal when `|a - b| <= tolerance`; otherwise exact. NaN == NaN is treated as equal; NaN vs non-NaN is a difference.
+ - **Integer vs Float cross-type:** if one file infers a column as Int64 and the other as Float64, both values are compared as f64. Tolerance applies if set.
+ - **Integer (Int64):** exact comparison (tolerance does not apply to pure integer columns).
+ - **String:** exact string comparison.
+ - **Datetime:** compare as timestamps.
+ - **Boolean:** exact match.
+ - **Null vs value:** treated as different. Null vs null: equal.
+6. Rows with any cell difference become a `ModifiedRow` with a list of `CellChange`s.
+7. Duplicate keys produce a warning on stderr. Last occurrence wins (same as Go).
+
+### Column Filtering (`--cols`)
+
+Applied before diffing. Both DataFrames are narrowed to the selected columns. Key columns are always retained, even when omitted from `--cols`.
+
+### Tolerance
+
+- Applies to Float64 columns and Int64-vs-Float64 cross-type comparisons in key-based mode.
+- Positional mode hashes exact string values; tolerance does not apply.
+- If `--tolerance` is set but no numeric columns exist, it is a silent no-op.
+
+## Output Formats
+
+### No Differences
+
+When the diff result is empty (no added, removed, or modified rows):
+
+- **Text:** prints "No differences found." to stdout.
+- **Markdown:** prints "No differences found."
+- **JSON:** `{"added":[],"removed":[],"modified":[]}`
+- **CSV:** header row only (no data rows).
+
+All formats exit 0.
+
+### Text (default, colored)
+
+```
+--- Sheet1 (old.xlsx)
++++ Sheet1 (new.xlsx)
+
+Added: 1 | Removed: 1 | Modified: 1
+
+- ID: "3" Name: "Charlie" Score: "70"
++ ID: "4" Name: "Dana" Score: "85"
+
+~ ID: "1"
+ Score: "90" → "95"
+```
+
+ANSI colors: red (`-`), green (`+`), yellow (`~`). Auto-detects TTY via `std::io::IsTerminal`; `--no-color` overrides.
+
+### Markdown (`--format markdown`)
+
+```markdown
+## Added (1)
+| ID | Name | Score |
+|----|------|-------|
+| 4 | Dana | 85 |
+
+## Removed (1)
+| ID | Name | Score |
+|----|------|-------|
+| 3 | Charlie | 70 |
+
+## Modified (1)
+| Key (ID) | Column | Old | New |
+|----------|--------|-----|-----|
+| 1 | Score | 90 | 95 |
+```
+
+Clean, uncolored tables. The added/removed tables can reuse `formatter.rs::render_table_header()` and `render_table_rows()` for consistent alignment with xlcat/xlfilter output. The modified table is diff-specific.
+
+### JSON (`--format json`)
+
+```json
+{
+ "added": [{"ID": "4", "Name": "Dana", "Score": "85"}],
+ "removed": [{"ID": "3", "Name": "Charlie", "Score": "70"}],
+ "modified": [
+ {
+ "key": {"ID": "1"},
+ "changes": [{"column": "Score", "old": "90", "new": "95"}]
+ }
+ ]
+}
+```
+
+Empty arrays are `[]`, never null. Pretty-printed with 2-space indentation.
+
+### CSV (`--format csv`)
+
+```csv
+_status,ID,Name,Score,_old_ID,_old_Name,_old_Score
+added,4,Dana,85,,,
+removed,3,Charlie,70,,,
+modified,1,,95,,,90
+```
+
+`_status` column indicates row type. `_old_*` columns hold previous values. For modified rows: new values go in the main columns, old values in `_old_*` columns, only for changed cells. Unchanged cells are empty in both positions. Key column values appear in the main columns to identify the row.
+
+## Module Layout
+
+### New Files
+
+- **`src/diff.rs`** — Core diff logic: `DiffResult`, positional/key-based algorithms, tolerance, duplicate key detection.
+- **`src/bin/xldiff.rs`** — CLI binary: clap argument parsing, sheet resolution, output formatting (text, markdown, json, csv).
+
+### Reused Modules
+
+- **`reader.rs`** — `read_sheet()` / `read_sheet_with_skip()` to load DataFrames.
+- **`metadata.rs`** — `read_file_info()` for sheet resolution.
+- **`cell.rs`** — Column letter resolution (shared with filter.rs via `col_letter_to_index()`).
+- **`formatter.rs`** — `render_table_header()` and `render_table_rows()` for markdown added/removed tables.
+
+### Output Formatting
+
+The four format renderers live in the binary (`xldiff.rs`), not in `formatter.rs`. Diff output structure differs enough from xlcat's table rendering to warrant separation. The markdown formatter reuses `formatter.rs` table-rendering helpers for the added/removed sections.
+
+### Cargo.toml Changes
+
+- Add `[[bin]] name = "xldiff"` entry.
+- Add `serde = { version = "1", features = ["derive"] }` and `serde_json = "1"` for JSON output.
+- TTY detection via `std::io::IsTerminal` (stable since Rust 1.70, no extra crate needed).
+
+### Pipeline
+
+```
+parse args → resolve sheets → read DataFrames
+ → narrow to --cols → diff (positional or key-based)
+ → format output → print → exit code
+```
+
+## Testing
+
+- **Unit tests** in `diff.rs`: positional mode, key-based mode, tolerance (including cross-type Int64/Float64, NaN handling), duplicate keys, column filtering, edge cases (empty sheets, single-row, mismatched columns/headers).
+- **Integration tests** in `tests/test_xldiff.rs`: CLI end-to-end with `assert_cmd`.
+- **Fixtures**: add helpers to `tests/common/mod.rs`:
+ - `create_diff_pair()` — two files with added/removed/identical rows for positional mode.
+ - `create_diff_pair_with_keys()` — two files with key columns and modified cells.
+ - `create_diff_pair_with_floats()` — two files with float values differing by small amounts for tolerance testing.
diff --git a/src/bin/xldiff.rs b/src/bin/xldiff.rs
@@ -0,0 +1,826 @@
+use std::io::IsTerminal;
+use std::path::PathBuf;
+use std::process;
+
+use anyhow::{Result, bail};
+use clap::Parser;
+use serde_json::{Map, Value, json};
+
+use xlcat::diff::{DiffOptions, DiffResult, SheetSource};
+use xlcat::filter;
+use xlcat::formatter;
+use xlcat::metadata;
+use xlcat::reader;
+
+#[derive(Parser)]
+#[command(
+ name = "xldiff",
+ about = "Compare two Excel spreadsheets and show differences",
+ version
+)]
+struct Args {
+ /// First file (optionally file.xlsx:SheetName)
+ file_a: String,
+
+ /// Second file (optionally file.xlsx:SheetName)
+ file_b: String,
+
+ /// Key column(s) for matching rows (comma-separated names or letters)
+ #[arg(long)]
+ key: Option<String>,
+
+ /// Columns to compare (comma-separated names or letters)
+ #[arg(long)]
+ cols: Option<String>,
+
+ /// Rows to skip before header: single number or "skipA,skipB"
+ #[arg(long, default_value = "0")]
+ skip: String,
+
+ /// Numeric tolerance for float comparisons
+ #[arg(long)]
+ tolerance: Option<f64>,
+
+ /// Output format: text, markdown, json, csv
+ #[arg(long, default_value = "text")]
+ format: String,
+
+ /// Disable colored output
+ #[arg(long)]
+ no_color: bool,
+}
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+/// Split a CLI argument like "file.xlsx:Sheet1" into (path, optional sheet).
+///
+/// Handles Windows drive letters (e.g. C:\file.xlsx) by ignoring a colon
+/// at position 1 when followed by `\` or `/`. A trailing colon with nothing
+/// after it is treated as no sheet specification.
+fn parse_file_arg(arg: &str) -> (PathBuf, Option<String>) {
+ // Find the *last* colon — that's the sheet separator.
+ if let Some(pos) = arg.rfind(':') {
+ // Skip Windows drive letters: colon at position 1 followed by \ or /
+ if pos == 1 {
+ let after = arg.as_bytes().get(2);
+ if after == Some(&b'\\') || after == Some(&b'/') {
+ return (PathBuf::from(arg), None);
+ }
+ }
+ let sheet_part = &arg[pos + 1..];
+ if sheet_part.is_empty() {
+ // Trailing colon — ignore it
+ return (PathBuf::from(&arg[..pos]), None);
+ }
+ (PathBuf::from(&arg[..pos]), Some(sheet_part.to_string()))
+ } else {
+ (PathBuf::from(arg), None)
+ }
+}
+
+/// Parse the --skip flag: either "3" (same skip for both) or "3,5".
+fn parse_skip(s: &str) -> Result<(usize, usize)> {
+ if let Some((a, b)) = s.split_once(',') {
+ let skip_a: usize = a
+ .trim()
+ .parse()
+ .map_err(|_| anyhow::anyhow!("invalid skip value: '{}'", a.trim()))?;
+ let skip_b: usize = b
+ .trim()
+ .parse()
+ .map_err(|_| anyhow::anyhow!("invalid skip value: '{}'", b.trim()))?;
+ Ok((skip_a, skip_b))
+ } else {
+ let skip: usize = s
+ .trim()
+ .parse()
+ .map_err(|_| anyhow::anyhow!("invalid skip value: '{}'", s.trim()))?;
+ Ok((skip, skip))
+ }
+}
+
+/// Resolve a sheet name: exact match, then 0-based index, then error.
+fn resolve_sheet(
+ info: &metadata::FileInfo,
+ sheet_arg: Option<&str>,
+) -> Result<String> {
+ match sheet_arg {
+ None => info
+ .sheets
+ .first()
+ .map(|s| s.name.clone())
+ .ok_or_else(|| anyhow::anyhow!("workbook has no sheets")),
+ Some(s) => {
+ // Exact name match
+ if let Some(sheet) = info.sheets.iter().find(|si| si.name == s) {
+ return Ok(sheet.name.clone());
+ }
+ // 0-based index
+ if let Ok(idx) = s.parse::<usize>()
+ && let Some(sheet) = info.sheets.get(idx)
+ {
+ return Ok(sheet.name.clone());
+ }
+ let names: Vec<_> = info.sheets.iter().map(|si| si.name.as_str()).collect();
+ bail!(
+ "sheet '{}' not found. Available sheets: {}",
+ s,
+ names.join(", ")
+ )
+ }
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Output formatters
+// ---------------------------------------------------------------------------
+
+/// Format a row's values inline: `Name: "Alice" Score: "90"`
+fn format_row_inline(headers: &[String], values: &[String]) -> String {
+ headers
+ .iter()
+ .zip(values.iter())
+ .map(|(h, v)| format!("{}: \"{}\"", h, v))
+ .collect::<Vec<_>>()
+ .join(" ")
+}
+
+/// Format diff result as colored (or plain) text output.
+fn format_text(result: &DiffResult, color: bool) -> String {
+ if !result.has_differences() {
+ return "No differences found.\n".to_string();
+ }
+
+ let (red, green, yellow, reset) = if color {
+ ("\x1b[31m", "\x1b[32m", "\x1b[33m", "\x1b[0m")
+ } else {
+ ("", "", "", "")
+ };
+
+ let mut out = String::new();
+
+ // Header
+ out.push_str(&format!(
+ "--- {} ({})\n+++ {} ({})\n\n",
+ result.source_a.sheet_name,
+ result.source_a.file_name,
+ result.source_b.sheet_name,
+ result.source_b.file_name,
+ ));
+
+ // Summary
+ out.push_str(&format!(
+ "Added: {} | Removed: {} | Modified: {}\n\n",
+ result.added.len(),
+ result.removed.len(),
+ result.modified.len(),
+ ));
+
+ // Removed rows
+ for row in &result.removed {
+ out.push_str(&format!(
+ "{}- {}{}",
+ red,
+ format_row_inline(&result.headers, &row.values),
+ reset,
+ ));
+ out.push('\n');
+ }
+
+ // Added rows
+ for row in &result.added {
+ out.push_str(&format!(
+ "{}+ {}{}",
+ green,
+ format_row_inline(&result.headers, &row.values),
+ reset,
+ ));
+ out.push('\n');
+ }
+
+ // Modified rows
+ for m in &result.modified {
+ // Build key display: Key: "value" Key2: "value2"
+ let key_display: Vec<String> = result
+ .key_columns
+ .iter()
+ .zip(m.key.iter())
+ .map(|(col, val)| format!("{}: \"{}\"", col, val))
+ .collect();
+ out.push_str(&format!(
+ "{}~ {}{}",
+ yellow,
+ key_display.join(" "),
+ reset,
+ ));
+ out.push('\n');
+ for change in &m.changes {
+ out.push_str(&format!(
+ " {}: \"{}\" \u{2192} \"{}\"\n",
+ change.column, change.old_value, change.new_value,
+ ));
+ }
+ }
+
+ out
+}
+
+/// Format diff result as markdown.
+fn format_markdown(result: &DiffResult) -> String {
+ if !result.has_differences() {
+ return "No differences found.\n".to_string();
+ }
+
+ let mut out = String::new();
+
+ // Added
+ if !result.added.is_empty() {
+ out.push_str(&format!("## Added ({})\n\n", result.added.len()));
+ let rows: Vec<Vec<String>> = result
+ .added
+ .iter()
+ .map(|r| r.values.clone())
+ .collect();
+ out.push_str(&formatter::render_table(&result.headers, &rows));
+ out.push('\n');
+ }
+
+ // Removed
+ if !result.removed.is_empty() {
+ out.push_str(&format!("## Removed ({})\n\n", result.removed.len()));
+ let rows: Vec<Vec<String>> = result
+ .removed
+ .iter()
+ .map(|r| r.values.clone())
+ .collect();
+ out.push_str(&formatter::render_table(&result.headers, &rows));
+ out.push('\n');
+ }
+
+ // Modified
+ if !result.modified.is_empty() {
+ out.push_str(&format!("## Modified ({})\n\n", result.modified.len()));
+
+ let key_label = if result.key_columns.len() == 1 {
+ format!("Key ({})", result.key_columns[0])
+ } else {
+ format!("Key ({})", result.key_columns.join(", "))
+ };
+ let headers = vec![
+ key_label,
+ "Column".to_string(),
+ "Old".to_string(),
+ "New".to_string(),
+ ];
+ let mut rows: Vec<Vec<String>> = Vec::new();
+
+ for m in &result.modified {
+ let key_display = m.key.join(", ");
+
+ for (i, change) in m.changes.iter().enumerate() {
+ let key_cell = if i == 0 {
+ key_display.clone()
+ } else {
+ String::new()
+ };
+ rows.push(vec![
+ key_cell,
+ change.column.clone(),
+ change.old_value.clone(),
+ change.new_value.clone(),
+ ]);
+ }
+ }
+
+ out.push_str(&formatter::render_table(&headers, &rows));
+ out.push('\n');
+ }
+
+ out
+}
+
+/// Format diff result as JSON.
+fn format_json(result: &DiffResult) -> String {
+ let added: Vec<Value> = result
+ .added
+ .iter()
+ .map(|row| {
+ let mut map = Map::new();
+ for (h, v) in result.headers.iter().zip(row.values.iter()) {
+ map.insert(h.clone(), Value::String(v.clone()));
+ }
+ Value::Object(map)
+ })
+ .collect();
+
+ let removed: Vec<Value> = result
+ .removed
+ .iter()
+ .map(|row| {
+ let mut map = Map::new();
+ for (h, v) in result.headers.iter().zip(row.values.iter()) {
+ map.insert(h.clone(), Value::String(v.clone()));
+ }
+ Value::Object(map)
+ })
+ .collect();
+
+ let modified: Vec<Value> = result
+ .modified
+ .iter()
+ .map(|m| {
+ let mut key_map = Map::new();
+ for (col, val) in result.key_columns.iter().zip(m.key.iter()) {
+ key_map.insert(col.clone(), Value::String(val.clone()));
+ }
+ let changes: Vec<Value> = m
+ .changes
+ .iter()
+ .map(|c| {
+ json!({
+ "column": c.column,
+ "old": c.old_value,
+ "new": c.new_value,
+ })
+ })
+ .collect();
+ json!({
+ "key": Value::Object(key_map),
+ "changes": changes,
+ })
+ })
+ .collect();
+
+ let output = json!({
+ "added": added,
+ "removed": removed,
+ "modified": modified,
+ });
+
+ serde_json::to_string_pretty(&output).unwrap() + "\n"
+}
+
+/// Quote a value per RFC 4180: if it contains comma, quote, or newline, wrap
+/// in double quotes and escape any internal quotes by doubling them.
+fn csv_quote(value: &str) -> String {
+ if value.contains(',') || value.contains('"') || value.contains('\n') {
+ format!("\"{}\"", value.replace('"', "\"\""))
+ } else {
+ value.to_string()
+ }
+}
+
+/// Build a CSV row from a slice of values.
+fn csv_row(values: &[String]) -> String {
+ values.iter().map(|v| csv_quote(v)).collect::<Vec<_>>().join(",")
+}
+
+/// Format diff result as CSV.
+///
+/// Header: _status, col1, col2, ..., _old_col1, _old_col2, ...
+/// Added rows: "added" + values + empty _old_ columns
+/// Removed rows: "removed" + values + empty _old_ columns
+/// Modified rows: "modified" + new values (key cols + changed new values) + old values in _old_ columns
+fn format_csv(result: &DiffResult) -> String {
+ let mut out = String::new();
+
+ // Build header
+ let mut header_parts: Vec<String> = vec!["_status".to_string()];
+ for h in &result.headers {
+ header_parts.push(h.clone());
+ }
+ for h in &result.headers {
+ header_parts.push(format!("_old_{}", h));
+ }
+ out.push_str(&csv_row(&header_parts));
+ out.push('\n');
+
+ let empty_cols: Vec<String> = result.headers.iter().map(|_| String::new()).collect();
+
+ // Removed rows
+ for row in &result.removed {
+ let mut parts: Vec<String> = vec!["removed".to_string()];
+ parts.extend(row.values.iter().cloned());
+ // Pad if row has fewer values than headers
+ while parts.len() < 1 + result.headers.len() {
+ parts.push(String::new());
+ }
+ parts.extend(empty_cols.iter().cloned());
+ out.push_str(&csv_row(&parts));
+ out.push('\n');
+ }
+
+ // Added rows
+ for row in &result.added {
+ let mut parts: Vec<String> = vec!["added".to_string()];
+ parts.extend(row.values.iter().cloned());
+ while parts.len() < 1 + result.headers.len() {
+ parts.push(String::new());
+ }
+ parts.extend(empty_cols.iter().cloned());
+ out.push_str(&csv_row(&parts));
+ out.push('\n');
+ }
+
+ // Modified rows
+ for m in &result.modified {
+ let mut main_cols: Vec<String> = Vec::new();
+ let mut old_cols: Vec<String> = Vec::new();
+
+ for h in &result.headers {
+ // Check if this is a key column
+ if let Some(key_idx) = result.key_columns.iter().position(|k| k == h) {
+ main_cols.push(m.key.get(key_idx).cloned().unwrap_or_default());
+ old_cols.push(String::new());
+ } else if let Some(change) = m.changes.iter().find(|c| c.column == *h) {
+ main_cols.push(change.new_value.clone());
+ old_cols.push(change.old_value.clone());
+ } else {
+ // Unchanged non-key column — leave empty in both
+ main_cols.push(String::new());
+ old_cols.push(String::new());
+ }
+ }
+
+ let mut parts: Vec<String> = vec!["modified".to_string()];
+ parts.extend(main_cols);
+ parts.extend(old_cols);
+ out.push_str(&csv_row(&parts));
+ out.push('\n');
+ }
+
+ out
+}
+
+// ---------------------------------------------------------------------------
+// run / main
+// ---------------------------------------------------------------------------
+
+fn run(args: Args) -> Result<()> {
+ // Parse file arguments
+ let (path_a, sheet_arg_a) = parse_file_arg(&args.file_a);
+ let (path_b, sheet_arg_b) = parse_file_arg(&args.file_b);
+
+ // Validate files exist
+ if !path_a.exists() {
+ bail!("file not found: {}", path_a.display());
+ }
+ if !path_b.exists() {
+ bail!("file not found: {}", path_b.display());
+ }
+
+ // Validate format
+ let format = args.format.to_lowercase();
+ if !["text", "markdown", "json", "csv"].contains(&format.as_str()) {
+ bail!(
+ "unknown format '{}'. Use: text, markdown, json, csv",
+ args.format
+ );
+ }
+
+ // Parse skip
+ let (skip_a, skip_b) = parse_skip(&args.skip)?;
+
+ // Read file info and resolve sheets
+ let info_a = metadata::read_file_info(&path_a)?;
+ let info_b = metadata::read_file_info(&path_b)?;
+
+ let sheet_a = resolve_sheet(&info_a, sheet_arg_a.as_deref())?;
+ let sheet_b = resolve_sheet(&info_b, sheet_arg_b.as_deref())?;
+
+ // Read DataFrames
+ let df_a = if skip_a > 0 {
+ reader::read_sheet_with_skip(&path_a, &sheet_a, skip_a)?
+ } else {
+ reader::read_sheet(&path_a, &sheet_a)?
+ };
+ let df_b = if skip_b > 0 {
+ reader::read_sheet_with_skip(&path_b, &sheet_b, skip_b)?
+ } else {
+ reader::read_sheet(&path_b, &sheet_b)?
+ };
+
+ // Resolve key columns
+ let key_columns: Vec<String> = if let Some(ref key_str) = args.key {
+ let specs: Vec<String> = key_str.split(',').map(|s| s.trim().to_string()).collect();
+
+ let df_a_cols: Vec<String> = df_a
+ .get_column_names()
+ .iter()
+ .map(|s| s.to_string())
+ .collect();
+ let df_b_cols: Vec<String> = df_b
+ .get_column_names()
+ .iter()
+ .map(|s| s.to_string())
+ .collect();
+
+ let resolved_a = filter::resolve_columns(&specs, &df_a_cols)
+ .map_err(|e| anyhow::anyhow!("first file: {}", e))?;
+ let resolved_b = filter::resolve_columns(&specs, &df_b_cols)
+ .map_err(|e| anyhow::anyhow!("second file: {}", e))?;
+
+ // Validate keys match between both files
+ if resolved_a != resolved_b {
+ bail!(
+ "key columns resolve to different names: {:?} vs {:?}",
+ resolved_a,
+ resolved_b
+ );
+ }
+ resolved_a
+ } else {
+ vec![]
+ };
+
+ // Column filtering with --cols
+ let (df_a, df_b) = if let Some(ref cols_str) = args.cols {
+ let specs: Vec<String> = cols_str.split(',').map(|s| s.trim().to_string()).collect();
+
+ let df_a_cols: Vec<String> = df_a
+ .get_column_names()
+ .iter()
+ .map(|s| s.to_string())
+ .collect();
+ let df_b_cols: Vec<String> = df_b
+ .get_column_names()
+ .iter()
+ .map(|s| s.to_string())
+ .collect();
+
+ let mut selected_a = filter::resolve_columns(&specs, &df_a_cols)
+ .map_err(|e| anyhow::anyhow!(e))?;
+ let mut selected_b = filter::resolve_columns(&specs, &df_b_cols)
+ .map_err(|e| anyhow::anyhow!(e))?;
+
+ // Ensure key columns are included
+ for key in &key_columns {
+ if !selected_a.contains(key) {
+ selected_a.insert(0, key.clone());
+ }
+ if !selected_b.contains(key) {
+ selected_b.insert(0, key.clone());
+ }
+ }
+
+ let df_a = df_a.select(selected_a.iter().map(|s| s.as_str()))?;
+ let df_b = df_b.select(selected_b.iter().map(|s| s.as_str()))?;
+ (df_a, df_b)
+ } else {
+ (df_a, df_b)
+ };
+
+ // Build sources and options
+ let file_name_a = path_a
+ .file_name()
+ .map(|s| s.to_string_lossy().to_string())
+ .unwrap_or_else(|| args.file_a.clone());
+ let file_name_b = path_b
+ .file_name()
+ .map(|s| s.to_string_lossy().to_string())
+ .unwrap_or_else(|| args.file_b.clone());
+
+ let source_a = SheetSource {
+ file_name: file_name_a,
+ sheet_name: sheet_a,
+ };
+ let source_b = SheetSource {
+ file_name: file_name_b,
+ sheet_name: sheet_b,
+ };
+
+ let opts = DiffOptions {
+ key_columns,
+ tolerance: args.tolerance,
+ };
+
+ // Run diff
+ let result = xlcat::diff::diff_sheets(&df_a, &df_b, &opts, source_a, source_b)?;
+
+ // TTY detection for color
+ let use_color = !args.no_color && std::io::stdout().is_terminal();
+
+ // Format output
+ let output = match format.as_str() {
+ "text" => format_text(&result, use_color),
+ "markdown" => format_markdown(&result),
+ "json" => format_json(&result),
+ "csv" => format_csv(&result),
+ _ => unreachable!(),
+ };
+
+ print!("{}", output);
+
+ // Exit 1 if differences found (diff convention)
+ if result.has_differences() {
+ process::exit(1);
+ }
+
+ Ok(())
+}
+
+fn main() {
+ let args = Args::parse();
+ if let Err(err) = run(args) {
+ eprintln!("xldiff: {err}");
+ process::exit(2);
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use xlcat::diff::{CellChange, DiffRow, ModifiedRow};
+
+ // -- parse_file_arg --
+
+ #[test]
+ fn test_parse_file_arg_no_sheet() {
+ let (path, sheet) = parse_file_arg("data.xlsx");
+ assert_eq!(path, PathBuf::from("data.xlsx"));
+ assert_eq!(sheet, None);
+ }
+
+ #[test]
+ fn test_parse_file_arg_with_sheet() {
+ let (path, sheet) = parse_file_arg("data.xlsx:Revenue");
+ assert_eq!(path, PathBuf::from("data.xlsx"));
+ assert_eq!(sheet, Some("Revenue".to_string()));
+ }
+
+ #[test]
+ fn test_parse_file_arg_trailing_colon() {
+ let (path, sheet) = parse_file_arg("data.xlsx:");
+ assert_eq!(path, PathBuf::from("data.xlsx"));
+ assert_eq!(sheet, None);
+ }
+
+ #[test]
+ fn test_parse_file_arg_windows_drive() {
+ let (path, sheet) = parse_file_arg("C:\\Users\\file.xlsx");
+ assert_eq!(path, PathBuf::from("C:\\Users\\file.xlsx"));
+ assert_eq!(sheet, None);
+ }
+
+ #[test]
+ fn test_parse_file_arg_windows_drive_with_sheet() {
+ let (path, sheet) = parse_file_arg("C:\\Users\\file.xlsx:Sheet2");
+ assert_eq!(path, PathBuf::from("C:\\Users\\file.xlsx"));
+ assert_eq!(sheet, Some("Sheet2".to_string()));
+ }
+
+ // -- parse_skip --
+
+ #[test]
+ fn test_parse_skip_single() {
+ assert_eq!(parse_skip("3").unwrap(), (3, 3));
+ }
+
+ #[test]
+ fn test_parse_skip_pair() {
+ assert_eq!(parse_skip("3,5").unwrap(), (3, 5));
+ }
+
+ #[test]
+ fn test_parse_skip_zero() {
+ assert_eq!(parse_skip("0").unwrap(), (0, 0));
+ }
+
+ #[test]
+ fn test_parse_skip_invalid() {
+ assert!(parse_skip("abc").is_err());
+ }
+
+ // -- csv_quote --
+
+ #[test]
+ fn test_csv_quote_plain() {
+ assert_eq!(csv_quote("hello"), "hello");
+ }
+
+ #[test]
+ fn test_csv_quote_comma() {
+ assert_eq!(csv_quote("a,b"), "\"a,b\"");
+ }
+
+ #[test]
+ fn test_csv_quote_quotes() {
+ assert_eq!(csv_quote("say \"hi\""), "\"say \"\"hi\"\"\"");
+ }
+
+ // -- format_text --
+
+ #[test]
+ fn test_format_text_no_diff() {
+ let result = DiffResult {
+ headers: vec!["a".into()],
+ key_columns: vec![],
+ added: vec![],
+ removed: vec![],
+ modified: vec![],
+ source_a: SheetSource {
+ file_name: "a.xlsx".into(),
+ sheet_name: "Sheet1".into(),
+ },
+ source_b: SheetSource {
+ file_name: "b.xlsx".into(),
+ sheet_name: "Sheet1".into(),
+ },
+ };
+ assert_eq!(format_text(&result, false), "No differences found.\n");
+ }
+
+ #[test]
+ fn test_format_text_with_changes() {
+ let result = DiffResult {
+ headers: vec!["id".into(), "name".into()],
+ key_columns: vec!["id".into()],
+ added: vec![DiffRow {
+ values: vec!["3".into(), "Charlie".into()],
+ }],
+ removed: vec![DiffRow {
+ values: vec!["1".into(), "Alice".into()],
+ }],
+ modified: vec![ModifiedRow {
+ key: vec!["2".into()],
+ changes: vec![CellChange {
+ column: "name".into(),
+ old_value: "Bob".into(),
+ new_value: "Robert".into(),
+ }],
+ }],
+ source_a: SheetSource {
+ file_name: "a.xlsx".into(),
+ sheet_name: "Sheet1".into(),
+ },
+ source_b: SheetSource {
+ file_name: "b.xlsx".into(),
+ sheet_name: "Sheet1".into(),
+ },
+ };
+ let text = format_text(&result, false);
+ assert!(text.contains("--- Sheet1 (a.xlsx)"));
+ assert!(text.contains("+++ Sheet1 (b.xlsx)"));
+ assert!(text.contains("Added: 1"));
+ assert!(text.contains("Removed: 1"));
+ assert!(text.contains("Modified: 1"));
+ assert!(text.contains("- id: \"1\" name: \"Alice\""));
+ assert!(text.contains("+ id: \"3\" name: \"Charlie\""));
+ assert!(text.contains("~ id: \"2\""));
+ assert!(text.contains("name: \"Bob\" \u{2192} \"Robert\""));
+ }
+
+ // -- format_json --
+
+ #[test]
+ fn test_format_json_structure() {
+ let result = DiffResult {
+ headers: vec!["id".into(), "val".into()],
+ key_columns: vec!["id".into()],
+ added: vec![DiffRow {
+ values: vec!["2".into(), "new".into()],
+ }],
+ removed: vec![],
+ modified: vec![],
+ source_a: SheetSource {
+ file_name: "a.xlsx".into(),
+ sheet_name: "S1".into(),
+ },
+ source_b: SheetSource {
+ file_name: "b.xlsx".into(),
+ sheet_name: "S1".into(),
+ },
+ };
+ let json_str = format_json(&result);
+ let parsed: Value = serde_json::from_str(&json_str).unwrap();
+ assert_eq!(parsed["added"][0]["id"], "2");
+ assert_eq!(parsed["added"][0]["val"], "new");
+ assert!(parsed["removed"].as_array().unwrap().is_empty());
+ }
+
+ // -- format_csv --
+
+ #[test]
+ fn test_format_csv_header() {
+ let result = DiffResult {
+ headers: vec!["id".into(), "name".into()],
+ key_columns: vec!["id".into()],
+ added: vec![],
+ removed: vec![],
+ modified: vec![],
+ source_a: SheetSource {
+ file_name: "a.xlsx".into(),
+ sheet_name: "S".into(),
+ },
+ source_b: SheetSource {
+ file_name: "b.xlsx".into(),
+ sheet_name: "S".into(),
+ },
+ };
+ let csv = format_csv(&result);
+ let first_line = csv.lines().next().unwrap();
+ assert_eq!(first_line, "_status,id,name,_old_id,_old_name");
+ }
+}
diff --git a/src/bin/xlfilter.rs b/src/bin/xlfilter.rs
@@ -0,0 +1,207 @@
+use std::path::PathBuf;
+use std::process;
+
+use anyhow::Result;
+use clap::Parser;
+
+use xlcat::filter::{
+ parse_filter_expr, parse_sort_spec, filter_pipeline, FilterOptions,
+};
+use xlcat::formatter;
+use xlcat::metadata;
+use xlcat::metadata::SheetInfo;
+use xlcat::reader;
+
// CLI arguments for xlfilter.
//
// NOTE: the `///` doc comments on the fields below double as clap's
// --help text, i.e. they are user-facing strings — edit with care.
// Field names also determine the derived flag names (e.g. `cols` → --cols),
// except where an explicit `long = "..."` is given.
#[derive(Parser)]
#[command(
    name = "xlfilter",
    about = "Filter and query Excel spreadsheet data",
    version
)]
struct Args {
    /// Path to .xls or .xlsx file
    file: PathBuf,

    /// Select columns by letter (A,B,D) or header name (State,Amount)
    #[arg(long)]
    cols: Option<String>,

    /// Filter rows. Multiple = AND. Operators: = != > < >= <= ~ (contains, case-insensitive) !~ (not contains)
    // `where` is a Rust keyword, hence the explicit long name.
    #[arg(long = "where")]
    filters: Vec<String>,

    /// Sort by column. Format: col:dir (dir = asc or desc, default asc)
    #[arg(long)]
    sort: Option<String>,

    /// Max rows in output (applied after filtering)
    #[arg(long)]
    limit: Option<usize>,

    /// First N rows (applied before filtering)
    #[arg(long)]
    head: Option<usize>,

    /// Last N rows (applied before filtering)
    #[arg(long)]
    tail: Option<usize>,

    /// Target sheet by name or 0-based index (default: first sheet)
    #[arg(long)]
    sheet: Option<String>,

    /// Output as CSV instead of markdown table
    #[arg(long)]
    csv: bool,

    /// Skip first N data rows (for metadata/title rows above the real header)
    #[arg(long)]
    skip: Option<usize>,
}
+
+// ---------------------------------------------------------------------------
+// ArgError — used for user-facing flag/argument errors (exit code 2)
+// ---------------------------------------------------------------------------
+
/// Marker error type for user-facing flag/argument mistakes.
/// `main` downcasts to it and maps it to exit code 2.
#[derive(Debug)]
struct ArgError(String);

impl std::fmt::Display for ArgError {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        // The wrapped message is already fully formatted.
        f.write_str(&self.0)
    }
}

impl std::error::Error for ArgError {}
+
+fn run(args: Args) -> Result<()> {
+ // Validate file exists
+ if !args.file.exists() {
+ anyhow::bail!(ArgError(format!(
+ "file not found: {}",
+ args.file.display()
+ )));
+ }
+
+ // Validate mutually exclusive flags
+ if args.head.is_some() && args.tail.is_some() {
+ anyhow::bail!(ArgError(
+ "--head and --tail are mutually exclusive".to_string()
+ ));
+ }
+
+ // Read file metadata to resolve sheet name
+ let info = metadata::read_file_info(&args.file)?;
+ let sheet_name = resolve_sheet(&info, args.sheet.as_deref())?;
+
+ // Read sheet into DataFrame
+ let df = if let Some(skip) = args.skip {
+ reader::read_sheet_with_skip(&args.file, &sheet_name, skip)?
+ } else {
+ reader::read_sheet(&args.file, &sheet_name)?
+ };
+
+ if df.height() == 0 {
+ eprintln!("0 rows");
+ let sheet_info = info
+ .sheets
+ .iter()
+ .find(|s| s.name == sheet_name)
+ .cloned()
+ .unwrap_or(SheetInfo {
+ name: sheet_name,
+ rows: 0,
+ cols: 0,
+ });
+ println!("{}", formatter::format_empty_sheet(&sheet_info));
+ return Ok(());
+ }
+
+ // Parse filter expressions
+ let filters: Vec<_> = args
+ .filters
+ .iter()
+ .map(|s| parse_filter_expr(s))
+ .collect::<Result<Vec<_>, _>>()
+ .map_err(|e| anyhow::anyhow!(ArgError(e)))?;
+
+ // Parse sort spec
+ let sort = args
+ .sort
+ .as_deref()
+ .map(parse_sort_spec)
+ .transpose()
+ .map_err(|e| anyhow::anyhow!(ArgError(e)))?;
+
+ // Parse column selection
+ let cols = args.cols.map(|s| {
+ s.split(',')
+ .map(|c| c.trim().to_string())
+ .collect::<Vec<_>>()
+ });
+
+ // Run pipeline
+ let opts = FilterOptions {
+ filters,
+ cols,
+ sort,
+ limit: args.limit,
+ head: args.head,
+ tail: args.tail,
+ };
+ let result = filter_pipeline(df, &opts)?;
+
+ // Output row count to stderr
+ eprintln!("{} rows", result.height());
+
+ // Format output
+ if result.height() == 0 {
+ println!("{}", formatter::format_data_table(&result));
+ } else if args.csv {
+ print!("{}", formatter::format_csv(&result));
+ } else {
+ println!("{}", formatter::format_data_table(&result));
+ }
+
+ Ok(())
+}
+
+/// Resolve sheet name from --sheet flag or default to first sheet.
+fn resolve_sheet(info: &metadata::FileInfo, sheet_arg: Option<&str>) -> Result<String> {
+ match sheet_arg {
+ None => {
+ info.sheets
+ .first()
+ .map(|s| s.name.clone())
+ .ok_or_else(|| anyhow::anyhow!("workbook has no sheets"))
+ }
+ Some(s) => {
+ if let Some(sheet) = info.sheets.iter().find(|si| si.name == s) {
+ return Ok(sheet.name.clone());
+ }
+ if let Ok(idx) = s.parse::<usize>() {
+ if let Some(sheet) = info.sheets.get(idx) {
+ return Ok(sheet.name.clone());
+ }
+ }
+ let names: Vec<_> = info.sheets.iter().map(|s| s.name.as_str()).collect();
+ anyhow::bail!(ArgError(format!(
+ "sheet '{}' not found. Available sheets: {}",
+ s,
+ names.join(", ")
+ )))
+ }
+ }
+}
+
+fn main() {
+ let args = Args::parse();
+ if let Err(err) = run(args) {
+ if err.downcast_ref::<ArgError>().is_some() {
+ eprintln!("xlfilter: {err}");
+ process::exit(2);
+ }
+ eprintln!("xlfilter: {err}");
+ process::exit(1);
+ }
+}
diff --git a/src/diff.rs b/src/diff.rs
@@ -0,0 +1,830 @@
+// Diff engine for comparing two Excel sheets.
+
+use anyhow::{Result, bail};
+use polars::prelude::*;
+use std::collections::HashMap;
+
+use crate::formatter;
+
/// Source file and sheet metadata for display.
#[derive(Debug, Clone)]
pub struct SheetSource {
    // File name as supplied by the caller (used verbatim in output).
    pub file_name: String,
    // Sheet name within the workbook.
    pub sheet_name: String,
}

/// A single row from an added or removed set.
#[derive(Debug, Clone)]
pub struct DiffRow {
    // Display-formatted cell values, ordered to match `DiffResult::headers`.
    pub values: Vec<String>,
}

/// A change in a single cell.
#[derive(Debug, Clone)]
pub struct CellChange {
    // Column the change occurred in.
    pub column: String,
    // Value in file A (display-formatted; empty string for null).
    pub old_value: String,
    // Value in file B (display-formatted; empty string for null).
    pub new_value: String,
}

/// A row present in both files with cell-level differences.
#[derive(Debug, Clone)]
pub struct ModifiedRow {
    // Key-column values identifying the row (in key-column order).
    pub key: Vec<String>,
    // One entry per differing non-key cell.
    pub changes: Vec<CellChange>,
}

/// Result of comparing two sheets.
#[derive(Debug, Clone)]
pub struct DiffResult {
    // Unified header set used for display (see diff_positional/diff_keyed).
    pub headers: Vec<String>,
    // Empty for positional diffs; the --key columns for keyed diffs.
    pub key_columns: Vec<String>,
    pub added: Vec<DiffRow>,
    pub removed: Vec<DiffRow>,
    pub modified: Vec<ModifiedRow>,
    pub source_a: SheetSource,
    pub source_b: SheetSource,
}
+
+impl DiffResult {
+ pub fn has_differences(&self) -> bool {
+ !self.added.is_empty() || !self.removed.is_empty() || !self.modified.is_empty()
+ }
+}
+
/// Options controlling how the diff is performed.
#[derive(Debug, Clone, Default)]
pub struct DiffOptions {
    // Empty → positional diff; non-empty → rows matched by these columns.
    pub key_columns: Vec<String>,
    // Optional absolute tolerance for numeric comparisons (keyed diff only).
    pub tolerance: Option<f64>,
}
+
+// ---------------------------------------------------------------------------
+// Helper functions
+// ---------------------------------------------------------------------------
+
+/// Format a cell value for display. Returns empty string for null.
+fn cell_to_string(col: &Column, idx: usize) -> String {
+ match col.get(idx) {
+ Ok(AnyValue::Null) | Err(_) => String::new(),
+ Ok(v) => formatter::format_any_value(&v),
+ }
+}
+
+/// Format a cell value for hashing. Uses a sentinel for null so that null
+/// and empty string produce different keys.
+fn cell_to_key_part(col: &Column, idx: usize) -> String {
+ match col.get(idx) {
+ Ok(AnyValue::Null) | Err(_) => "\x01NULL\x01".to_string(),
+ Ok(v) => formatter::format_any_value(&v),
+ }
+}
+
+/// Build a string key for an entire row by joining all column values.
+fn row_to_key(df: &DataFrame, row_idx: usize) -> String {
+ df.get_columns()
+ .iter()
+ .map(|col| cell_to_key_part(col, row_idx))
+ .collect::<Vec<_>>()
+ .join("\0")
+}
+
+/// Collect display values for every column in a row.
+fn row_to_strings(df: &DataFrame, row_idx: usize) -> Vec<String> {
+ df.get_columns()
+ .iter()
+ .map(|col| cell_to_string(col, row_idx))
+ .collect()
+}
+
+// ---------------------------------------------------------------------------
+// Positional diff
+// ---------------------------------------------------------------------------
+
+/// Compare two DataFrames positionally (no key columns).
+///
+/// Uses multiset comparison: each unique row is tracked by frequency.
+/// Rows present in A but not (or fewer times) in B are "removed";
+/// rows present in B but not (or fewer times) in A are "added".
+pub fn diff_positional(
+ df_a: &DataFrame,
+ df_b: &DataFrame,
+ _opts: &DiffOptions,
+ source_a: SheetSource,
+ source_b: SheetSource,
+) -> Result<DiffResult> {
+ // Determine headers — use the longer header set.
+ let headers_a: Vec<String> = df_a.get_column_names().iter().map(|s| s.to_string()).collect();
+ let headers_b: Vec<String> = df_b.get_column_names().iter().map(|s| s.to_string()).collect();
+
+ let headers = if headers_b.len() > headers_a.len() {
+ if headers_a.len() != headers_b.len() {
+ eprintln!(
+ "Warning: column count differs ({} vs {}), using wider header set",
+ headers_a.len(),
+ headers_b.len()
+ );
+ }
+ headers_b.clone()
+ } else {
+ if headers_a.len() != headers_b.len() {
+ eprintln!(
+ "Warning: column count differs ({} vs {}), using wider header set",
+ headers_a.len(),
+ headers_b.len()
+ );
+ }
+ headers_a.clone()
+ };
+
+ let num_headers = headers.len();
+
+ // Build frequency maps: key → list of row indices (so we can consume them).
+ let mut freq_a: HashMap<String, Vec<usize>> = HashMap::new();
+ for i in 0..df_a.height() {
+ let key = row_to_key(df_a, i);
+ freq_a.entry(key).or_default().push(i);
+ }
+
+ let mut freq_b: HashMap<String, Vec<usize>> = HashMap::new();
+ for i in 0..df_b.height() {
+ let key = row_to_key(df_b, i);
+ freq_b.entry(key).or_default().push(i);
+ }
+
+ let mut removed = Vec::new();
+ let mut added = Vec::new();
+
+ // Walk A: for each row, try to consume a matching row from B.
+ for i in 0..df_a.height() {
+ let key = row_to_key(df_a, i);
+ let consumed = freq_b
+ .get_mut(&key)
+ .and_then(|indices| indices.pop())
+ .is_some();
+ if !consumed {
+ let mut vals = row_to_strings(df_a, i);
+ vals.resize(num_headers, String::new());
+ removed.push(DiffRow { values: vals });
+ }
+ }
+
+ // Walk B: for each row, try to consume a matching row from A.
+ for i in 0..df_b.height() {
+ let key = row_to_key(df_b, i);
+ let consumed = freq_a
+ .get_mut(&key)
+ .and_then(|indices| indices.pop())
+ .is_some();
+ if !consumed {
+ let mut vals = row_to_strings(df_b, i);
+ vals.resize(num_headers, String::new());
+ added.push(DiffRow { values: vals });
+ }
+ }
+
+ Ok(DiffResult {
+ headers,
+ key_columns: vec![],
+ added,
+ removed,
+ modified: vec![],
+ source_a,
+ source_b,
+ })
+}
+
+// ---------------------------------------------------------------------------
+// Key-based diff
+// ---------------------------------------------------------------------------
+
/// A row indexed by its key columns.
struct KeyedRow {
    // Display values for every column, in DataFrame column order.
    values: Vec<String>,
    // Display values of just the key columns, in key-column order.
    key_values: Vec<String>,
}
+
+/// Build a map from composite key string to KeyedRow for every row in the DataFrame.
+fn build_key_map(
+ df: &DataFrame,
+ key_indices: &[usize],
+ columns: &[Column],
+) -> HashMap<String, KeyedRow> {
+ let mut map = HashMap::new();
+ for i in 0..df.height() {
+ let key_values: Vec<String> = key_indices
+ .iter()
+ .map(|&ki| cell_to_string(&columns[ki], i))
+ .collect();
+ let composite_key = key_values.join("\0");
+ let values: Vec<String> = columns.iter().map(|col| cell_to_string(col, i)).collect();
+ map.insert(
+ composite_key,
+ KeyedRow {
+ values,
+ key_values,
+ },
+ );
+ }
+ map
+}
+
+/// Warn on stderr when duplicate keys are found.
+fn check_duplicate_keys(
+ df: &DataFrame,
+ key_indices: &[usize],
+ columns: &[Column],
+ source: &SheetSource,
+) {
+ let mut seen: HashMap<String, usize> = HashMap::new();
+ for i in 0..df.height() {
+ let key: String = key_indices
+ .iter()
+ .map(|&ki| cell_to_string(&columns[ki], i))
+ .collect::<Vec<_>>()
+ .join("\0");
+ let count = seen.entry(key.clone()).or_insert(0);
+ *count += 1;
+ if *count == 2 {
+ let display_key = key.replace('\0', ", ");
+ eprintln!(
+ "Warning: duplicate key [{}] in {}:{}",
+ display_key, source.file_name, source.sheet_name
+ );
+ }
+ }
+}
+
+/// Check whether a polars DataType is a float type.
+fn is_float_dtype(dt: &DataType) -> bool {
+ matches!(dt, DataType::Float32 | DataType::Float64)
+}
+
+/// Check whether a polars DataType is an integer type.
+fn is_int_dtype(dt: &DataType) -> bool {
+ matches!(
+ dt,
+ DataType::Int8
+ | DataType::Int16
+ | DataType::Int32
+ | DataType::Int64
+ | DataType::UInt8
+ | DataType::UInt16
+ | DataType::UInt32
+ | DataType::UInt64
+ )
+}
+
+/// Compare two string-rendered values with optional numeric tolerance.
+///
+/// Rules:
+/// - NaN == NaN is true.
+/// - NaN vs non-NaN is false.
+/// - Pure int+int columns use exact comparison (no tolerance applied).
+/// - At least one float column applies tolerance.
+/// - Otherwise exact string comparison.
+fn values_equal_with_tolerance(
+ val_a: &str,
+ val_b: &str,
+ tolerance: f64,
+ df_a: &DataFrame,
+ df_b: &DataFrame,
+ col_name: &str,
+) -> bool {
+ let parsed_a = val_a.parse::<f64>();
+ let parsed_b = val_b.parse::<f64>();
+
+ match (parsed_a, parsed_b) {
+ (Ok(a), Ok(b)) => {
+ if a.is_nan() && b.is_nan() {
+ return true;
+ }
+ if a.is_nan() || b.is_nan() {
+ return false;
+ }
+
+ let dt_a = df_a
+ .column(col_name)
+ .map(|c| c.dtype().clone())
+ .unwrap_or(DataType::String);
+ let dt_b = df_b
+ .column(col_name)
+ .map(|c| c.dtype().clone())
+ .unwrap_or(DataType::String);
+
+ if is_int_dtype(&dt_a) && is_int_dtype(&dt_b) {
+ val_a == val_b
+ } else if is_float_dtype(&dt_a) || is_float_dtype(&dt_b) {
+ (a - b).abs() <= tolerance
+ } else {
+ val_a == val_b
+ }
+ }
+ _ => val_a == val_b,
+ }
+}
+
+/// Compare non-key columns of two keyed rows and return cell-level changes.
+#[allow(clippy::too_many_arguments)]
+fn compare_rows(
+ df_a: &DataFrame,
+ df_b: &DataFrame,
+ headers_a: &[String],
+ headers_b: &[String],
+ row_a: &KeyedRow,
+ row_b: &KeyedRow,
+ common_columns: &[String],
+ opts: &DiffOptions,
+) -> Vec<CellChange> {
+ let mut changes = Vec::new();
+ for col_name in common_columns {
+ let idx_a = headers_a.iter().position(|h| h == col_name);
+ let idx_b = headers_b.iter().position(|h| h == col_name);
+ let val_a = idx_a
+ .map(|i| row_a.values.get(i).cloned().unwrap_or_default())
+ .unwrap_or_default();
+ let val_b = idx_b
+ .map(|i| row_b.values.get(i).cloned().unwrap_or_default())
+ .unwrap_or_default();
+
+ let equal = if let Some(tol) = opts.tolerance {
+ values_equal_with_tolerance(&val_a, &val_b, tol, df_a, df_b, col_name)
+ } else {
+ val_a == val_b
+ };
+
+ if !equal {
+ changes.push(CellChange {
+ column: col_name.clone(),
+ old_value: val_a,
+ new_value: val_b,
+ });
+ }
+ }
+ changes
+}
+
/// Compare two DataFrames using key columns.
///
/// Rows are matched by the composite value of `opts.key_columns` instead of
/// by position:
/// - keys only in A → removed rows
/// - keys only in B → added rows
/// - keys in both   → cell-by-cell comparison of common non-key columns
///
/// Fails with an error if any key column is missing from either frame.
/// Duplicate keys trigger a warning; the last occurrence wins (see
/// `build_key_map`).
///
/// NOTE(review): added/removed/modified are collected by iterating
/// `HashMap`s, so their ordering is not deterministic across runs —
/// confirm downstream formatters/tests do not rely on a stable order.
pub fn diff_keyed(
    df_a: &DataFrame,
    df_b: &DataFrame,
    opts: &DiffOptions,
    source_a: SheetSource,
    source_b: SheetSource,
) -> Result<DiffResult> {
    let columns_a = df_a.get_columns();
    let columns_b = df_b.get_columns();
    let headers_a: Vec<String> = df_a.get_column_names().iter().map(|s| s.to_string()).collect();
    let headers_b: Vec<String> = df_b.get_column_names().iter().map(|s| s.to_string()).collect();

    // Resolve key column indices in both frames.
    let mut key_indices_a = Vec::new();
    let mut key_indices_b = Vec::new();
    for key_col in &opts.key_columns {
        match headers_a.iter().position(|h| h == key_col) {
            Some(idx) => key_indices_a.push(idx),
            None => bail!("Key column '{}' not found in {}", key_col, source_a.file_name),
        }
        match headers_b.iter().position(|h| h == key_col) {
            Some(idx) => key_indices_b.push(idx),
            None => bail!("Key column '{}' not found in {}", key_col, source_b.file_name),
        }
    }

    // Find non-key columns.
    let non_key_a: Vec<String> = headers_a
        .iter()
        .filter(|h| !opts.key_columns.contains(h))
        .cloned()
        .collect();
    let non_key_b: Vec<String> = headers_b
        .iter()
        .filter(|h| !opts.key_columns.contains(h))
        .cloned()
        .collect();

    // Common non-key columns (for modification detection).
    let common_columns: Vec<String> = non_key_a
        .iter()
        .filter(|h| non_key_b.contains(h))
        .cloned()
        .collect();

    // Warn about columns only in one file.
    for col in &non_key_a {
        if !non_key_b.contains(col) {
            eprintln!("Warning: column '{}' only in {}", col, source_a.file_name);
        }
    }
    for col in &non_key_b {
        if !non_key_a.contains(col) {
            eprintln!("Warning: column '{}' only in {}", col, source_b.file_name);
        }
    }

    // Build output headers: key columns + all from A non-key + B-only non-key.
    let mut headers = opts.key_columns.clone();
    headers.extend(non_key_a.iter().cloned());
    for col in &non_key_b {
        if !non_key_a.contains(col) {
            headers.push(col.clone());
        }
    }

    // Check for duplicate keys.
    check_duplicate_keys(df_a, &key_indices_a, columns_a, &source_a);
    check_duplicate_keys(df_b, &key_indices_b, columns_b, &source_b);

    // Build key maps.
    let map_a = build_key_map(df_a, &key_indices_a, columns_a);
    let map_b = build_key_map(df_b, &key_indices_b, columns_b);

    let mut removed = Vec::new();
    let mut added = Vec::new();
    let mut modified = Vec::new();

    // Keys in A but not in B → removed. Values are laid out in output-header
    // order; columns absent from A render as empty strings.
    for (composite_key, row_a) in &map_a {
        if !map_b.contains_key(composite_key) {
            let mut vals = Vec::new();
            for h in &headers {
                if let Some(idx) = headers_a.iter().position(|ha| ha == h) {
                    vals.push(row_a.values.get(idx).cloned().unwrap_or_default());
                } else {
                    vals.push(String::new());
                }
            }
            removed.push(DiffRow { values: vals });
        }
    }

    // Keys in B but not in A → added (same layout rule as above).
    for (composite_key, row_b) in &map_b {
        if !map_a.contains_key(composite_key) {
            let mut vals = Vec::new();
            for h in &headers {
                if let Some(idx) = headers_b.iter().position(|hb| hb == h) {
                    vals.push(row_b.values.get(idx).cloned().unwrap_or_default());
                } else {
                    vals.push(String::new());
                }
            }
            added.push(DiffRow { values: vals });
        }
    }

    // Keys in both → compare for modifications.
    for (composite_key, row_a) in &map_a {
        if let Some(row_b) = map_b.get(composite_key) {
            let changes = compare_rows(
                df_a,
                df_b,
                &headers_a,
                &headers_b,
                row_a,
                row_b,
                &common_columns,
                opts,
            );
            if !changes.is_empty() {
                modified.push(ModifiedRow {
                    key: row_a.key_values.clone(),
                    changes,
                });
            }
        }
    }

    Ok(DiffResult {
        headers,
        key_columns: opts.key_columns.clone(),
        added,
        removed,
        modified,
        source_a,
        source_b,
    })
}
+
+// ---------------------------------------------------------------------------
+// Entry point
+// ---------------------------------------------------------------------------
+
+/// Compare two DataFrames, dispatching to positional or key-based diff
+/// depending on whether key columns are specified.
+pub fn diff_sheets(
+ df_a: &DataFrame,
+ df_b: &DataFrame,
+ opts: &DiffOptions,
+ source_a: SheetSource,
+ source_b: SheetSource,
+) -> Result<DiffResult> {
+ if opts.key_columns.is_empty() {
+ diff_positional(df_a, df_b, opts, source_a, source_b)
+ } else {
+ diff_keyed(df_a, df_b, opts, source_a, source_b)
+ }
+}
+
#[cfg(test)]
mod tests {
    use super::*;

    // Fixture: source metadata for the left-hand ("old") file.
    fn test_source_a() -> SheetSource {
        SheetSource {
            file_name: "a.xlsx".into(),
            sheet_name: "Sheet1".into(),
        }
    }

    // Fixture: source metadata for the right-hand ("new") file.
    fn test_source_b() -> SheetSource {
        SheetSource {
            file_name: "b.xlsx".into(),
            sheet_name: "Sheet1".into(),
        }
    }

    // ---- Positional diff tests ----

    // Identical frames report no differences at all.
    #[test]
    fn test_positional_no_diff() {
        let df_a = df! {
            "name" => &["Alice", "Bob"],
            "score" => &[100, 200],
        }
        .unwrap();
        let df_b = df_a.clone();
        let opts = DiffOptions::default();

        let result = diff_positional(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert!(!result.has_differences());
        assert!(result.added.is_empty());
        assert!(result.removed.is_empty());
        assert!(result.modified.is_empty());
    }

    // A replaced row shows up as one removal plus one addition (positional
    // diff has no concept of "modified").
    #[test]
    fn test_positional_added_removed() {
        let df_a = df! {
            "name" => &["Alice", "Bob"],
        }
        .unwrap();
        let df_b = df! {
            "name" => &["Alice", "Charlie"],
        }
        .unwrap();
        let opts = DiffOptions::default();

        let result = diff_positional(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert!(result.has_differences());
        assert_eq!(result.removed.len(), 1);
        assert_eq!(result.removed[0].values, vec!["Bob"]);
        assert_eq!(result.added.len(), 1);
        assert_eq!(result.added[0].values, vec!["Charlie"]);
    }

    // Multiset semantics: 3×"A" vs 2×"A" → exactly one removal.
    #[test]
    fn test_positional_duplicate_rows() {
        let df_a = df! {
            "val" => &["A", "A", "A"],
        }
        .unwrap();
        let df_b = df! {
            "val" => &["A", "A"],
        }
        .unwrap();
        let opts = DiffOptions::default();

        let result = diff_positional(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert_eq!(result.removed.len(), 1);
        assert_eq!(result.removed[0].values, vec!["A"]);
        assert!(result.added.is_empty());
    }

    // ---- Key-based diff tests ----

    #[test]
    fn test_keyed_no_diff() {
        let df_a = df! {
            "id" => &[1, 2],
            "name" => &["Alice", "Bob"],
        }
        .unwrap();
        let df_b = df_a.clone();
        let opts = DiffOptions {
            key_columns: vec!["id".into()],
            tolerance: None,
        };

        let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert!(!result.has_differences());
    }

    // Keys present in only one frame are reported as removed/added.
    // `contains` is used (not positional equality) because row value layout
    // follows the unified header order.
    #[test]
    fn test_keyed_added_removed() {
        let df_a = df! {
            "id" => &[1, 2],
            "name" => &["Alice", "Bob"],
        }
        .unwrap();
        let df_b = df! {
            "id" => &[2, 3],
            "name" => &["Bob", "Charlie"],
        }
        .unwrap();
        let opts = DiffOptions {
            key_columns: vec!["id".into()],
            tolerance: None,
        };

        let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert_eq!(result.removed.len(), 1);
        assert!(result.removed[0].values.contains(&"1".to_string()));
        assert!(result.removed[0].values.contains(&"Alice".to_string()));

        assert_eq!(result.added.len(), 1);
        assert!(result.added[0].values.contains(&"3".to_string()));
        assert!(result.added[0].values.contains(&"Charlie".to_string()));
    }

    // A changed non-key cell yields a ModifiedRow with one CellChange.
    #[test]
    fn test_keyed_modified() {
        let df_a = df! {
            "id" => &[1, 2],
            "score" => &[100, 200],
        }
        .unwrap();
        let df_b = df! {
            "id" => &[1, 2],
            "score" => &[100, 250],
        }
        .unwrap();
        let opts = DiffOptions {
            key_columns: vec!["id".into()],
            tolerance: None,
        };

        let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert!(result.added.is_empty());
        assert!(result.removed.is_empty());
        assert_eq!(result.modified.len(), 1);

        let m = &result.modified[0];
        assert_eq!(m.key, vec!["2"]);
        assert_eq!(m.changes.len(), 1);
        assert_eq!(m.changes[0].column, "score");
        assert_eq!(m.changes[0].old_value, "200");
        assert_eq!(m.changes[0].new_value, "250");
    }

    // Composite (multi-column) keys identify rows jointly.
    #[test]
    fn test_keyed_composite_key() {
        let df_a = df! {
            "date" => &["2024-01-01", "2024-01-01", "2024-01-02"],
            "ticker" => &["AAPL", "GOOG", "AAPL"],
            "price" => &[150.0, 140.0, 151.0],
        }
        .unwrap();
        let df_b = df! {
            "date" => &["2024-01-01", "2024-01-01", "2024-01-02"],
            "ticker" => &["AAPL", "GOOG", "AAPL"],
            "price" => &[150.0, 142.0, 151.0],
        }
        .unwrap();
        let opts = DiffOptions {
            key_columns: vec!["date".into(), "ticker".into()],
            tolerance: None,
        };

        let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert!(result.added.is_empty());
        assert!(result.removed.is_empty());
        assert_eq!(result.modified.len(), 1);

        let m = &result.modified[0];
        assert_eq!(m.key, vec!["2024-01-01", "GOOG"]);
        assert_eq!(m.changes[0].column, "price");
        assert_eq!(m.changes[0].old_value, "140");
        assert_eq!(m.changes[0].new_value, "142");
    }

    // ---- Tolerance tests ----

    // Float delta within --tolerance is not reported as a change.
    #[test]
    fn test_keyed_tolerance_within() {
        let df_a = df! {
            "id" => &[1],
            "price" => &[100.001_f64],
        }
        .unwrap();
        let df_b = df! {
            "id" => &[1],
            "price" => &[100.002_f64],
        }
        .unwrap();
        let opts = DiffOptions {
            key_columns: vec!["id".into()],
            tolerance: Some(0.01),
        };

        let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert!(!result.has_differences());
    }

    // Float delta exceeding --tolerance is reported.
    #[test]
    fn test_keyed_tolerance_exceeded() {
        let df_a = df! {
            "id" => &[1],
            "price" => &[100.0_f64],
        }
        .unwrap();
        let df_b = df! {
            "id" => &[1],
            "price" => &[100.05_f64],
        }
        .unwrap();
        let opts = DiffOptions {
            key_columns: vec!["id".into()],
            tolerance: Some(0.01),
        };

        let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert_eq!(result.modified.len(), 1);
        assert_eq!(result.modified[0].changes[0].column, "price");
    }

    // NaN == NaN is deliberately treated as equal under tolerance compare.
    #[test]
    fn test_keyed_nan_handling() {
        let df_a = df! {
            "id" => &[1],
            "value" => &[f64::NAN],
        }
        .unwrap();
        let df_b = df! {
            "id" => &[1],
            "value" => &[f64::NAN],
        }
        .unwrap();
        let opts = DiffOptions {
            key_columns: vec!["id".into()],
            tolerance: Some(0.01),
        };

        let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert!(!result.has_differences(), "NaN vs NaN should be treated as equal");
    }

    // ---- diff_sheets entry point tests ----

    // No key columns → positional dispatch.
    #[test]
    fn test_diff_sheets_positional() {
        let df_a = df! {
            "name" => &["Alice", "Bob"],
        }
        .unwrap();
        let df_b = df! {
            "name" => &["Alice", "Charlie"],
        }
        .unwrap();
        let opts = DiffOptions::default(); // No key columns → positional.

        let result = diff_sheets(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert!(result.key_columns.is_empty());
        assert_eq!(result.removed.len(), 1);
        assert_eq!(result.added.len(), 1);
    }

    // Key columns present → keyed dispatch.
    #[test]
    fn test_diff_sheets_keyed() {
        let df_a = df! {
            "id" => &[1, 2],
            "score" => &[100, 200],
        }
        .unwrap();
        let df_b = df! {
            "id" => &[1, 2],
            "score" => &[100, 250],
        }
        .unwrap();
        let opts = DiffOptions {
            key_columns: vec!["id".into()],
            tolerance: None,
        };

        let result = diff_sheets(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap();

        assert_eq!(result.key_columns, vec!["id"]);
        assert_eq!(result.modified.len(), 1);
    }
}
diff --git a/src/filter.rs b/src/filter.rs
@@ -0,0 +1,736 @@
+use anyhow::Result;
+use polars::prelude::*;
+
+/// Comparison operator recognised by `parse_filter_expr`.
+#[derive(Debug, Clone, PartialEq)]
+pub enum FilterOp {
+ /// `=` — equality (numeric when possible, else string).
+ Eq,
+ /// `!=` — inequality.
+ NotEq,
+ /// `>` — numeric greater-than.
+ Gt,
+ /// `<` — numeric less-than.
+ Lt,
+ /// `>=` — numeric greater-or-equal.
+ Gte,
+ /// `<=` — numeric less-or-equal.
+ Lte,
+ /// `~` — case-insensitive substring match.
+ Contains,
+ /// `!~` — negated case-insensitive substring match.
+ NotContains,
+}
+
+/// A single parsed filter: `column <op> value`, e.g. "Amount>1000".
+#[derive(Debug, Clone)]
+pub struct FilterExpr {
+ /// Column specifier exactly as typed (header name or column letter).
+ pub column: String,
+ /// Comparison operator.
+ pub op: FilterOp,
+ /// Right-hand side kept as a raw string; interpreted per-operator later.
+ pub value: String,
+}
+
+/// A parsed sort request: which column and in which direction.
+#[derive(Debug, Clone)]
+pub struct SortSpec {
+ /// Column specifier (header name or column letter).
+ pub column: String,
+ /// True for descending order; default is ascending.
+ pub descending: bool,
+}
+
+/// Parse a filter expression of the form `<column><op><value>`, e.g.
+/// "State=CA", "Amount>1000", "Name~john".
+///
+/// The column is everything before the first operator character
+/// (= ! > < ~); two-character operators (>=, <=, !=, !~) are recognised
+/// before their single-character prefixes, and everything after the
+/// operator — including further operator characters — is the value.
+pub fn parse_filter_expr(s: &str) -> Result<FilterExpr, String> {
+    let is_op_char = |c: char| matches!(c, '=' | '!' | '>' | '<' | '~');
+    let pos = match s.find(is_op_char) {
+        Some(p) => p,
+        None => {
+            return Err(format!(
+                "no operator found in '{}'. Use =, !=, >, <, >=, <=, ~ or !~",
+                s
+            ))
+        }
+    };
+    if pos == 0 {
+        return Err(format!("missing column name in '{}'", s));
+    }
+    let (column, rest) = s.split_at(pos);
+    // Two-character operators take precedence over their first character.
+    let (op, op_len) = match rest.get(..2) {
+        Some(">=") => (FilterOp::Gte, 2),
+        Some("<=") => (FilterOp::Lte, 2),
+        Some("!=") => (FilterOp::NotEq, 2),
+        Some("!~") => (FilterOp::NotContains, 2),
+        _ => match rest.chars().next() {
+            Some('>') => (FilterOp::Gt, 1),
+            Some('<') => (FilterOp::Lt, 1),
+            Some('=') => (FilterOp::Eq, 1),
+            Some('~') => (FilterOp::Contains, 1),
+            // e.g. a lone '!' not followed by '=' or '~'.
+            _ => return Err(format!("invalid operator in '{}'", s)),
+        },
+    };
+    Ok(FilterExpr {
+        column: column.to_string(),
+        op,
+        value: rest[op_len..].to_string(),
+    })
+}
+
+/// Parse a sort spec such as "Amount:desc" or "Name" (ascending by default).
+/// The split happens on the *last* colon, so column names that contain
+/// colons still work when an explicit direction is appended.
+pub fn parse_sort_spec(s: &str) -> Result<SortSpec, String> {
+    match s.rfind(':') {
+        // No colon at all: the whole string is the column, ascending.
+        None => Ok(SortSpec {
+            column: s.to_string(),
+            descending: false,
+        }),
+        Some(pos) => {
+            let (col, dir) = (&s[..pos], &s[pos + 1..]);
+            let descending = match dir.to_lowercase().as_str() {
+                "asc" => false,
+                "desc" => true,
+                _ => {
+                    return Err(format!(
+                        "invalid sort direction '{}'. Use 'asc' or 'desc'",
+                        dir
+                    ))
+                }
+            };
+            Ok(SortSpec {
+                column: col.to_string(),
+                descending,
+            })
+        }
+    }
+}
+
+/// Convert a spreadsheet column letter like "A", "B", "AA" to a 0-based index.
+///
+/// Returns `None` when the input is empty, contains a non-alphabetic
+/// character, or the bijective base-26 value would overflow `usize`
+/// (previously a pathologically long string could overflow — panicking in
+/// debug builds; it now simply fails to resolve as a letter).
+fn col_letter_to_index(s: &str) -> Option<usize> {
+    if s.is_empty() || !s.chars().all(|c| c.is_ascii_alphabetic()) {
+        return None;
+    }
+    let mut idx: usize = 0;
+    for c in s.to_uppercase().chars() {
+        // Bijective base-26 digit: A=1 .. Z=26.
+        let digit = c as usize - 'A' as usize + 1;
+        idx = idx.checked_mul(26)?.checked_add(digit)?;
+    }
+    Some(idx - 1)
+}
+
+/// Map a user-supplied column specifier onto an actual DataFrame column name.
+///
+/// Resolution order:
+/// 1. exact header-name match
+/// 2. case-insensitive header-name match
+/// 3. spreadsheet letter ("A", "B", "AA") resolved by position
+///
+/// A header name always wins over the letter interpretation.
+pub fn resolve_column(spec: &str, df_columns: &[String]) -> Result<String, String> {
+    // Exact match.
+    if df_columns.iter().any(|c| c == spec) {
+        return Ok(spec.to_string());
+    }
+    // Case-insensitive match (first hit wins).
+    let wanted = spec.to_lowercase();
+    if let Some(hit) = df_columns.iter().find(|c| c.to_lowercase() == wanted) {
+        return Ok(hit.clone());
+    }
+    // Letter interpretation, only for purely alphabetic specs in range.
+    if let Some(name) = col_letter_to_index(spec).and_then(|i| df_columns.get(i)) {
+        return Ok(name.clone());
+    }
+    Err(format!(
+        "column '{}' not found. Available columns: {}",
+        spec,
+        df_columns.join(", ")
+    ))
+}
+
+/// Resolve every specifier in `specs`, failing on the first that doesn't match.
+pub fn resolve_columns(specs: &[String], df_columns: &[String]) -> Result<Vec<String>, String> {
+    let mut resolved = Vec::with_capacity(specs.len());
+    for spec in specs {
+        resolved.push(resolve_column(spec, df_columns)?);
+    }
+    Ok(resolved)
+}
+
+/// True when `dtype` is one of polars' integer or float types.
+fn is_numeric_dtype(dtype: &DataType) -> bool {
+    use DataType::*;
+    match dtype {
+        Int8 | Int16 | Int32 | Int64 => true,
+        UInt8 | UInt16 | UInt32 | UInt64 => true,
+        Float32 | Float64 => true,
+        _ => false,
+    }
+}
+
+/// Build a boolean mask for a single filter expression against a DataFrame.
+///
+/// Semantics:
+/// - Eq/NotEq compare numerically when the column is numeric AND the value
+///   parses as f64; otherwise both sides are compared as strings.
+/// - Gt/Lt/Gte/Lte always require a numeric value and cast the column to f64.
+/// - Contains/NotContains are case-insensitive substring tests; null cells
+///   never match Contains and always match NotContains.
+fn build_filter_mask(df: &DataFrame, expr: &FilterExpr) -> Result<BooleanChunked> {
+ let col = df.column(&expr.column).map_err(|e| anyhow::anyhow!("{}", e))?;
+ let series = col.as_materialized_series();
+ let dtype = series.dtype();
+
+ match &expr.op {
+ FilterOp::Eq => {
+ if is_numeric_dtype(dtype) {
+ if let Ok(n) = expr.value.parse::<f64>() {
+ let s = series.cast(&DataType::Float64)?;
+ return Ok(s.f64()?.equal(n));
+ }
+ }
+ // Fallback: compare the cell's string rendering.
+ let s = series.cast(&DataType::String)?;
+ Ok(s.str()?.equal(expr.value.as_str()))
+ }
+ FilterOp::NotEq => {
+ if is_numeric_dtype(dtype) {
+ if let Ok(n) = expr.value.parse::<f64>() {
+ let s = series.cast(&DataType::Float64)?;
+ return Ok(s.f64()?.not_equal(n));
+ }
+ }
+ let s = series.cast(&DataType::String)?;
+ Ok(s.str()?.not_equal(expr.value.as_str()))
+ }
+ FilterOp::Gt => {
+ let n = parse_numeric_value(&expr.value, ">")?;
+ let s = series.cast(&DataType::Float64)?;
+ Ok(s.f64()?.gt(n))
+ }
+ FilterOp::Lt => {
+ let n = parse_numeric_value(&expr.value, "<")?;
+ let s = series.cast(&DataType::Float64)?;
+ Ok(s.f64()?.lt(n))
+ }
+ FilterOp::Gte => {
+ let n = parse_numeric_value(&expr.value, ">=")?;
+ let s = series.cast(&DataType::Float64)?;
+ Ok(s.f64()?.gt_eq(n))
+ }
+ FilterOp::Lte => {
+ let n = parse_numeric_value(&expr.value, "<=")?;
+ let s = series.cast(&DataType::Float64)?;
+ Ok(s.f64()?.lt_eq(n))
+ }
+ FilterOp::Contains => {
+ let s = series.cast(&DataType::String)?;
+ let ca = s.str()?;
+ let pat = expr.value.to_lowercase();
+ // Null cells map to false: a missing value contains nothing.
+ let mask: BooleanChunked = ca.into_iter()
+ .map(|opt_s| opt_s.map(|s| s.to_lowercase().contains(&pat)).unwrap_or(false))
+ .collect();
+ Ok(mask)
+ }
+ FilterOp::NotContains => {
+ let s = series.cast(&DataType::String)?;
+ let ca = s.str()?;
+ let pat = expr.value.to_lowercase();
+ // Null cells map to true: the complement of Contains.
+ let mask: BooleanChunked = ca.into_iter()
+ .map(|opt_s| opt_s.map(|s| !s.to_lowercase().contains(&pat)).unwrap_or(true))
+ .collect();
+ Ok(mask)
+ }
+ }
+}
+
+fn parse_numeric_value(value: &str, op: &str) -> Result<f64> {
+ value
+ .parse::<f64>()
+ .map_err(|_| anyhow::anyhow!("'{}' requires numeric value, got '{}'", op, value))
+}
+
+/// Apply filter expressions to `df` with AND semantics.
+/// An empty slice leaves the DataFrame unchanged.
+pub fn apply_filters(df: &DataFrame, exprs: &[FilterExpr]) -> Result<DataFrame> {
+    // Fold each expression's mask over a running copy of the frame;
+    // the first failing mask short-circuits the whole chain.
+    exprs.iter().try_fold(df.clone(), |acc, expr| {
+        let mask = build_filter_mask(&acc, expr)?;
+        Ok(acc.filter(&mask)?)
+    })
+}
+
+/// Options for the filter pipeline.
+pub struct FilterOptions {
+ /// Row predicates combined with AND.
+ pub filters: Vec<FilterExpr>,
+ /// Columns to project in the output; None keeps all columns.
+ pub cols: Option<Vec<String>>,
+ /// Optional sort applied after filtering.
+ pub sort: Option<SortSpec>,
+ /// Maximum rows kept after filtering and sorting.
+ pub limit: Option<usize>,
+ /// Pre-filter window: keep only the first N rows of the sheet.
+ pub head: Option<usize>,
+ /// Pre-filter window: keep only the last N rows (ignored when head is set).
+ pub tail: Option<usize>,
+}
+
+/// Apply a sort specification to a DataFrame.
+/// Null placement and tie behaviour follow polars' `SortMultipleOptions` defaults.
+pub fn apply_sort(df: &DataFrame, spec: &SortSpec) -> Result<DataFrame> {
+ let opts = SortMultipleOptions::default()
+ .with_order_descending(spec.descending);
+ Ok(df.sort([&spec.column], opts)?)
+}
+
+/// Run the full filter pipeline: head/tail → resolve & filter → sort → limit → select columns.
+///
+/// Column specifiers in filters, sort, and projection are resolved against
+/// the sheet's original header set (head/tail only drop rows, never columns,
+/// so capturing the names up front is safe).
+pub fn filter_pipeline(df: DataFrame, opts: &FilterOptions) -> Result<DataFrame> {
+ let df_columns: Vec<String> = df
+ .get_column_names()
+ .iter()
+ .map(|s| s.to_string())
+ .collect();
+
+ // 1. Pre-filter window: head or tail (head wins when both are set)
+ let df = if let Some(n) = opts.head {
+ df.head(Some(n))
+ } else if let Some(n) = opts.tail {
+ df.tail(Some(n))
+ } else {
+ df
+ };
+
+ // 2. Resolve column names in filter expressions and apply filters
+ let resolved_filters: Vec<FilterExpr> = opts
+ .filters
+ .iter()
+ .map(|f| {
+ let resolved_col = resolve_column(&f.column, &df_columns)?;
+ Ok(FilterExpr {
+ column: resolved_col,
+ op: f.op.clone(),
+ value: f.value.clone(),
+ })
+ })
+ .collect::<Result<Vec<_>, String>>()
+ .map_err(|e| anyhow::anyhow!("{}", e))?;
+
+ let df = apply_filters(&df, &resolved_filters)?;
+
+ // 3. Sort
+ let df = if let Some(ref spec) = opts.sort {
+ let resolved_col = resolve_column(&spec.column, &df_columns)
+ .map_err(|e| anyhow::anyhow!("{}", e))?;
+ let resolved_spec = SortSpec {
+ column: resolved_col,
+ descending: spec.descending,
+ };
+ apply_sort(&df, &resolved_spec)?
+ } else {
+ df
+ };
+
+ // 4. Limit (after filtering and sorting)
+ let df = if let Some(n) = opts.limit {
+ df.head(Some(n))
+ } else {
+ df
+ };
+
+ // 5. Select columns
+ let df = if let Some(ref col_specs) = opts.cols {
+ let resolved_cols = resolve_columns(col_specs, &df_columns)
+ .map_err(|e| anyhow::anyhow!("{}", e))?;
+ let col_refs: Vec<&str> = resolved_cols.iter().map(|s| s.as_str()).collect();
+ df.select(col_refs)?
+ } else {
+ df
+ };
+
+ Ok(df)
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ // Unit tests covering expression parsing, column resolution, mask
+ // building, sorting, and the full filter pipeline.
+ fn make_test_df() -> DataFrame {
+ DataFrame::new(vec![
+ Column::new("State".into(), &["CA", "NY", "CA", "TX", "NY"]),
+ Column::new("City".into(), &["LA", "NYC", "SF", "Houston", "Albany"]),
+ Column::new("Amount".into(), &[1500i64, 2000, 800, 1200, 500]),
+ Column::new("Year".into(), &[2023i64, 2023, 2024, 2024, 2023]),
+ Column::new("Status".into(), &["Active", "Active", "Draft", "Active", "Draft"]),
+ ])
+ .unwrap()
+ }
+
+ #[test]
+ fn filter_eq_string() {
+ let df = make_test_df();
+ let expr = parse_filter_expr("State=CA").unwrap();
+ let result = apply_filters(&df, &[expr]).unwrap();
+ assert_eq!(result.height(), 2);
+ }
+
+ #[test]
+ fn filter_eq_numeric() {
+ let df = make_test_df();
+ let expr = parse_filter_expr("Amount=1500").unwrap();
+ let result = apply_filters(&df, &[expr]).unwrap();
+ assert_eq!(result.height(), 1);
+ }
+
+ #[test]
+ fn filter_not_eq() {
+ let df = make_test_df();
+ let expr = parse_filter_expr("Status!=Draft").unwrap();
+ let result = apply_filters(&df, &[expr]).unwrap();
+ assert_eq!(result.height(), 3);
+ }
+
+ #[test]
+ fn filter_gt() {
+ let df = make_test_df();
+ let expr = parse_filter_expr("Amount>1000").unwrap();
+ let result = apply_filters(&df, &[expr]).unwrap();
+ assert_eq!(result.height(), 3);
+ }
+
+ #[test]
+ fn filter_lt() {
+ let df = make_test_df();
+ let expr = parse_filter_expr("Amount<1000").unwrap();
+ let result = apply_filters(&df, &[expr]).unwrap();
+ assert_eq!(result.height(), 2);
+ }
+
+ #[test]
+ fn filter_gte() {
+ let df = make_test_df();
+ let expr = parse_filter_expr("Amount>=1500").unwrap();
+ let result = apply_filters(&df, &[expr]).unwrap();
+ assert_eq!(result.height(), 2);
+ }
+
+ #[test]
+ fn filter_lte() {
+ let df = make_test_df();
+ let expr = parse_filter_expr("Amount<=800").unwrap();
+ let result = apply_filters(&df, &[expr]).unwrap();
+ assert_eq!(result.height(), 2);
+ }
+
+ #[test]
+ fn filter_contains() {
+ let df = make_test_df();
+ let expr = parse_filter_expr("City~ou").unwrap();
+ let result = apply_filters(&df, &[expr]).unwrap();
+ assert_eq!(result.height(), 1);
+ }
+
+ #[test]
+ fn filter_contains_case_insensitive() {
+ let df = make_test_df();
+ let expr = parse_filter_expr("City~HOUSTON").unwrap();
+ let result = apply_filters(&df, &[expr]).unwrap();
+ assert_eq!(result.height(), 1);
+ }
+
+ #[test]
+ fn filter_not_contains() {
+ let df = make_test_df();
+ let expr = parse_filter_expr("Status!~raft").unwrap();
+ let result = apply_filters(&df, &[expr]).unwrap();
+ assert_eq!(result.height(), 3);
+ }
+
+ #[test]
+ fn filter_multiple_and() {
+ let df = make_test_df();
+ let e1 = parse_filter_expr("State=CA").unwrap();
+ let e2 = parse_filter_expr("Amount>1000").unwrap();
+ let result = apply_filters(&df, &[e1, e2]).unwrap();
+ assert_eq!(result.height(), 1);
+ }
+
+ #[test]
+ fn filter_no_matches_returns_empty() {
+ let df = make_test_df();
+ let expr = parse_filter_expr("State=ZZ").unwrap();
+ let result = apply_filters(&df, &[expr]).unwrap();
+ assert_eq!(result.height(), 0);
+ }
+
+ #[test]
+ fn filter_empty_exprs_returns_all() {
+ let df = make_test_df();
+ let result = apply_filters(&df, &[]).unwrap();
+ assert_eq!(result.height(), 5);
+ }
+
+ #[test]
+ fn parse_eq() {
+ let expr = parse_filter_expr("State=CA").unwrap();
+ assert_eq!(expr.column, "State");
+ assert_eq!(expr.op, FilterOp::Eq);
+ assert_eq!(expr.value, "CA");
+ }
+
+ #[test]
+ fn parse_not_eq() {
+ let expr = parse_filter_expr("Status!=Draft").unwrap();
+ assert_eq!(expr.column, "Status");
+ assert_eq!(expr.op, FilterOp::NotEq);
+ assert_eq!(expr.value, "Draft");
+ }
+
+ #[test]
+ fn parse_gt() {
+ let expr = parse_filter_expr("Amount>1000").unwrap();
+ assert_eq!(expr.column, "Amount");
+ assert_eq!(expr.op, FilterOp::Gt);
+ assert_eq!(expr.value, "1000");
+ }
+
+ #[test]
+ fn parse_lt() {
+ let expr = parse_filter_expr("Year<2024").unwrap();
+ assert_eq!(expr.column, "Year");
+ assert_eq!(expr.op, FilterOp::Lt);
+ assert_eq!(expr.value, "2024");
+ }
+
+ #[test]
+ fn parse_gte() {
+ let expr = parse_filter_expr("Score>=90").unwrap();
+ assert_eq!(expr.column, "Score");
+ assert_eq!(expr.op, FilterOp::Gte);
+ assert_eq!(expr.value, "90");
+ }
+
+ #[test]
+ fn parse_lte() {
+ let expr = parse_filter_expr("Price<=50.5").unwrap();
+ assert_eq!(expr.column, "Price");
+ assert_eq!(expr.op, FilterOp::Lte);
+ assert_eq!(expr.value, "50.5");
+ }
+
+ #[test]
+ fn parse_contains() {
+ let expr = parse_filter_expr("Name~john").unwrap();
+ assert_eq!(expr.column, "Name");
+ assert_eq!(expr.op, FilterOp::Contains);
+ assert_eq!(expr.value, "john");
+ }
+
+ #[test]
+ fn parse_not_contains() {
+ let expr = parse_filter_expr("Name!~draft").unwrap();
+ assert_eq!(expr.column, "Name");
+ assert_eq!(expr.op, FilterOp::NotContains);
+ assert_eq!(expr.value, "draft");
+ }
+
+ #[test]
+ fn parse_value_with_equals() {
+ let expr = parse_filter_expr("Formula=A+B=C").unwrap();
+ assert_eq!(expr.column, "Formula");
+ assert_eq!(expr.op, FilterOp::Eq);
+ assert_eq!(expr.value, "A+B=C");
+ }
+
+ #[test]
+ fn parse_empty_value() {
+ let expr = parse_filter_expr("Status=").unwrap();
+ assert_eq!(expr.column, "Status");
+ assert_eq!(expr.op, FilterOp::Eq);
+ assert_eq!(expr.value, "");
+ }
+
+ #[test]
+ fn parse_no_operator_is_err() {
+ assert!(parse_filter_expr("JustAWord").is_err());
+ }
+
+ #[test]
+ fn parse_no_column_is_err() {
+ assert!(parse_filter_expr("=value").is_err());
+ }
+
+ #[test]
+ fn parse_sort_desc() {
+ let spec = parse_sort_spec("Amount:desc").unwrap();
+ assert_eq!(spec.column, "Amount");
+ assert!(spec.descending);
+ }
+
+ #[test]
+ fn parse_sort_asc() {
+ let spec = parse_sort_spec("Name:asc").unwrap();
+ assert_eq!(spec.column, "Name");
+ assert!(!spec.descending);
+ }
+
+ #[test]
+ fn parse_sort_default_asc() {
+ let spec = parse_sort_spec("Name").unwrap();
+ assert_eq!(spec.column, "Name");
+ assert!(!spec.descending);
+ }
+
+ #[test]
+ fn parse_sort_bad_dir_is_err() {
+ assert!(parse_sort_spec("Name:up").is_err());
+ }
+
+ #[test]
+ fn resolve_by_header_name() {
+ let cols = vec!["State".to_string(), "Amount".to_string(), "Year".to_string()];
+ assert_eq!(resolve_column("Amount", &cols).unwrap(), "Amount");
+ }
+
+ #[test]
+ fn resolve_by_letter() {
+ let cols = vec!["State".to_string(), "Amount".to_string(), "Year".to_string()];
+ assert_eq!(resolve_column("B", &cols).unwrap(), "Amount");
+ }
+
+ #[test]
+ fn resolve_by_letter_lowercase() {
+ let cols = vec!["State".to_string(), "Amount".to_string(), "Year".to_string()];
+ assert_eq!(resolve_column("b", &cols).unwrap(), "Amount");
+ }
+
+ #[test]
+ fn resolve_header_takes_priority_over_letter() {
+ let cols = vec!["A".to_string(), "B".to_string()];
+ assert_eq!(resolve_column("A", &cols).unwrap(), "A");
+ }
+
+ #[test]
+ fn resolve_case_insensitive_header() {
+ let cols = vec!["State".to_string(), "Amount".to_string()];
+ assert_eq!(resolve_column("state", &cols).unwrap(), "State");
+ }
+
+ #[test]
+ fn resolve_unknown_column_is_err() {
+ let cols = vec!["State".to_string(), "Amount".to_string()];
+ let err = resolve_column("Foo", &cols).unwrap_err();
+ assert!(err.contains("not found"), "error was: {}", err);
+ }
+
+ #[test]
+ fn resolve_letter_out_of_range_is_err() {
+ let cols = vec!["State".to_string()];
+ let err = resolve_column("C", &cols).unwrap_err();
+ assert!(err.contains("not found"), "error was: {}", err);
+ }
+
+ #[test]
+ fn resolve_multiple_columns() {
+ let cols = vec!["State".to_string(), "Amount".to_string(), "Year".to_string()];
+ let resolved = resolve_columns(&["A".to_string(), "Year".to_string()], &cols).unwrap();
+ assert_eq!(resolved, vec!["State", "Year"]);
+ }
+
+ #[test]
+ fn sort_ascending() {
+ let df = make_test_df();
+ let spec = parse_sort_spec("Amount:asc").unwrap();
+ let result = apply_sort(&df, &spec).unwrap();
+ let col = result.column("Amount").unwrap().as_materialized_series();
+ let amounts = col.i64().unwrap();
+ assert_eq!(amounts.get(0), Some(500));
+ assert_eq!(amounts.get(4), Some(2000));
+ }
+
+ #[test]
+ fn sort_descending() {
+ let df = make_test_df();
+ let spec = parse_sort_spec("Amount:desc").unwrap();
+ let result = apply_sort(&df, &spec).unwrap();
+ let col = result.column("Amount").unwrap().as_materialized_series();
+ let amounts = col.i64().unwrap();
+ assert_eq!(amounts.get(0), Some(2000));
+ assert_eq!(amounts.get(4), Some(500));
+ }
+
+ #[test]
+ fn pipeline_full() {
+ let df = make_test_df();
+ let opts = FilterOptions {
+ filters: vec![parse_filter_expr("Amount>500").unwrap()],
+ cols: Some(vec!["State".to_string(), "Amount".to_string()]),
+ sort: Some(parse_sort_spec("Amount:desc").unwrap()),
+ limit: Some(2),
+ head: None,
+ tail: None,
+ };
+ let result = filter_pipeline(df, &opts).unwrap();
+ assert_eq!(result.height(), 2);
+ assert_eq!(result.width(), 2);
+ let col = result.column("Amount").unwrap().as_materialized_series();
+ let amounts = col.i64().unwrap();
+ assert_eq!(amounts.get(0), Some(2000));
+ assert_eq!(amounts.get(1), Some(1500));
+ }
+
+ #[test]
+ fn pipeline_head_before_filter() {
+ let df = make_test_df(); // 5 rows: CA/LA, NY/NYC, CA/SF, TX/Houston, NY/Albany
+ let opts = FilterOptions {
+ filters: vec![parse_filter_expr("State=NY").unwrap()],
+ cols: None,
+ sort: None,
+ limit: None,
+ head: Some(3), // Take first 3 rows before filtering
+ tail: None,
+ };
+ let result = filter_pipeline(df, &opts).unwrap();
+ // First 3 rows: CA/LA, NY/NYC, CA/SF → only NY/NYC matches
+ assert_eq!(result.height(), 1);
+ }
+
+ #[test]
+ fn pipeline_tail_before_filter() {
+ let df = make_test_df(); // 5 rows
+ let opts = FilterOptions {
+ filters: vec![parse_filter_expr("State=CA").unwrap()],
+ cols: None,
+ sort: None,
+ limit: None,
+ head: None,
+ tail: Some(3), // Last 3 rows before filtering
+ };
+ let result = filter_pipeline(df, &opts).unwrap();
+ // Last 3 rows: CA/SF, TX/Houston, NY/Albany → only CA/SF matches
+ assert_eq!(result.height(), 1);
+ }
+
+ #[test]
+ fn pipeline_no_options_returns_all() {
+ let df = make_test_df();
+ let opts = FilterOptions {
+ filters: vec![],
+ cols: None,
+ sort: None,
+ limit: None,
+ head: None,
+ tail: None,
+ };
+ let result = filter_pipeline(df, &opts).unwrap();
+ assert_eq!(result.height(), 5);
+ assert_eq!(result.width(), 5);
+ }
+
+ #[test]
+ fn pipeline_cols_by_letter() {
+ let df = make_test_df();
+ let opts = FilterOptions {
+ filters: vec![],
+ cols: Some(vec!["A".to_string(), "C".to_string()]),
+ sort: None,
+ limit: None,
+ head: None,
+ tail: None,
+ };
+ let result = filter_pipeline(df, &opts).unwrap();
+ assert_eq!(result.width(), 2);
+ let names: Vec<String> = result.get_column_names().iter().map(|s| s.to_string()).collect();
+ assert_eq!(names, vec!["State", "Amount"]);
+ }
+
+ #[test]
+ fn pipeline_limit_after_filter() {
+ let df = make_test_df();
+ let opts = FilterOptions {
+ filters: vec![parse_filter_expr("Status=Active").unwrap()],
+ cols: None,
+ sort: None,
+ limit: Some(2),
+ head: None,
+ tail: None,
+ };
+ let result = filter_pipeline(df, &opts).unwrap();
+ assert_eq!(result.height(), 2); // 3 Active rows, limited to 2
+ }
+}
diff --git a/src/formatter.rs b/src/formatter.rs
@@ -237,7 +237,7 @@ fn df_to_strings(df: &DataFrame) -> (Vec<String>, Vec<Vec<String>>) {
}
/// Compute the display width for each column.
-fn compute_col_widths(headers: &[String], rows: &[Vec<String>]) -> Vec<usize> {
+pub fn compute_col_widths(headers: &[String], rows: &[Vec<String>]) -> Vec<usize> {
let mut widths: Vec<usize> = headers.iter().map(|h| h.len().max(3)).collect();
for row in rows {
for (i, cell) in row.iter().enumerate() {
@@ -250,7 +250,7 @@ fn compute_col_widths(headers: &[String], rows: &[Vec<String>]) -> Vec<usize> {
}
/// Render a markdown table header + separator line.
-fn render_table_header(headers: &[String], widths: &[usize]) -> String {
+pub fn render_table_header(headers: &[String], widths: &[usize]) -> String {
let mut out = String::new();
out.push('|');
for (i, h) in headers.iter().enumerate() {
@@ -270,7 +270,7 @@ fn render_table_header(headers: &[String], widths: &[usize]) -> String {
}
/// Render markdown table data rows (no header).
-fn render_table_rows(rows: &[Vec<String>], widths: &[usize]) -> String {
+pub fn render_table_rows(rows: &[Vec<String>], widths: &[usize]) -> String {
let mut out = String::new();
for row in rows {
out.push('|');
@@ -284,7 +284,7 @@ fn render_table_rows(rows: &[Vec<String>], widths: &[usize]) -> String {
}
/// Render a complete aligned markdown table.
-fn render_table(headers: &[String], rows: &[Vec<String>]) -> String {
+pub fn render_table(headers: &[String], rows: &[Vec<String>]) -> String {
let widths = compute_col_widths(headers, rows);
let mut out = render_table_header(headers, &widths);
out.push_str(&render_table_rows(rows, &widths));
@@ -300,7 +300,7 @@ fn format_cell(col: &Column, idx: usize) -> String {
}
/// Convert an AnyValue to its display string.
-fn format_any_value(v: &AnyValue) -> String {
+pub fn format_any_value(v: &AnyValue) -> String {
match v {
AnyValue::Null => String::new(),
AnyValue::Boolean(b) => b.to_string(),
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,4 +1,6 @@
pub mod cell;
+pub mod diff;
+pub mod filter;
pub mod formatter;
pub mod metadata;
pub mod reader;
diff --git a/src/reader.rs b/src/reader.rs
@@ -23,14 +23,33 @@ pub fn read_sheet(path: &Path, sheet_name: &str) -> Result<DataFrame> {
}
pub fn range_to_dataframe(range: &calamine::Range<Data>) -> Result<DataFrame> {
- let (total_rows, cols) = range.get_size();
- if total_rows == 0 || cols == 0 {
+ range_to_dataframe_skip(range, 0)
+}
+
+/// Read a sheet, skipping the first `skip` rows before treating the next row as the header.
+///
+/// Errors if the workbook cannot be opened or the named sheet does not exist.
+pub fn read_sheet_with_skip(path: &Path, sheet_name: &str, skip: usize) -> Result<DataFrame> {
+ let mut workbook = open_workbook_auto(path)
+ .with_context(|| format!("Cannot open workbook: {}", path.display()))?;
+ let range = workbook
+ .worksheet_range(sheet_name)
+ .with_context(|| format!("Cannot read sheet: {sheet_name}"))?;
+ range_to_dataframe_skip(&range, skip)
+}
+
+/// Convert a calamine Range to a DataFrame, skipping `skip` rows before the header.
+pub fn range_to_dataframe_skip(range: &calamine::Range<Data>, skip: usize) -> Result<DataFrame> {
+ let rows: Vec<&[Data]> = range.rows().skip(skip).collect();
+ let cols = if rows.is_empty() {
+ 0
+ } else {
+ rows.iter().map(|r| r.len()).max().unwrap_or(0)
+ };
+
+ if rows.is_empty() || cols == 0 {
return Ok(DataFrame::default());
}
- let rows: Vec<&[Data]> = range.rows().collect();
-
- // First row = headers
+ // First row (after skip) = headers
let headers: Vec<String> = rows[0]
.iter()
.enumerate()
@@ -40,7 +59,7 @@ pub fn range_to_dataframe(range: &calamine::Range<Data>) -> Result<DataFrame> {
})
.collect();
- if total_rows == 1 {
+ if rows.len() == 1 {
// Header only, no data
let series: Vec<Column> = headers
.iter()
@@ -374,4 +393,38 @@ mod tests {
assert_eq!(df.height(), 0);
assert_eq!(df.width(), 0);
}
+
+ /// Create xlsx with metadata rows above the real header
+ /// (rows 0-1 are banner text, row 2 is the Name/Value header,
+ /// rows 3-4 are the two data rows).
+ fn create_with_metadata_rows(path: &std::path::Path) {
+ let mut wb = Workbook::new();
+ let ws = wb.add_worksheet().set_name("Data").unwrap();
+ ws.write_string(0, 0, "Report Title").unwrap();
+ ws.write_string(1, 0, "Generated 2024-01-01").unwrap();
+ ws.write_string(2, 0, "Name").unwrap();
+ ws.write_string(2, 1, "Value").unwrap();
+ ws.write_string(3, 0, "Alice").unwrap();
+ ws.write_number(3, 1, 100.0).unwrap();
+ ws.write_string(4, 0, "Bob").unwrap();
+ ws.write_number(4, 1, 200.0).unwrap();
+ wb.save(path).unwrap();
+ }
+
+ #[test]
+ fn test_read_with_skip() {
+ // Skipping the two banner rows must surface Name/Value as the header.
+ let tmp = NamedTempFile::with_suffix(".xlsx").unwrap();
+ create_with_metadata_rows(tmp.path());
+ let df = read_sheet_with_skip(tmp.path(), "Data", 2).unwrap();
+ let col_names: Vec<String> = df.get_column_names().iter().map(|s| s.to_string()).collect();
+ assert_eq!(col_names, vec!["Name", "Value"]);
+ assert_eq!(df.height(), 2);
+ }
+
+ #[test]
+ fn test_read_with_skip_zero() {
+ // skip=0 must behave exactly like the plain read_sheet path.
+ let tmp = NamedTempFile::with_suffix(".xlsx").unwrap();
+ create_simple(tmp.path());
+ let df = read_sheet_with_skip(tmp.path(), "Data", 0).unwrap();
+ assert_eq!(df.height(), 5);
+ assert_eq!(df.width(), 4);
+ }
}
diff --git a/tests/common/mod.rs b/tests/common/mod.rs
@@ -107,3 +107,196 @@ pub fn create_empty_sheet(path: &Path) {
wb.add_worksheet().set_name("Blank").unwrap();
wb.save(path).unwrap();
}
+
+/// Create a test file with metadata rows above the real header.
+/// Layout: two banner rows, then the Name/Value header at row 2, then two
+/// data rows (Alice/100, Bob/200).
+pub fn create_with_metadata(path: &Path) {
+    use rust_xlsxwriter::*;
+    let mut workbook = Workbook::new();
+    let sheet = workbook.add_worksheet().set_name("Data").unwrap();
+
+    // Metadata rows
+    sheet.write_string(0, 0, "Quarterly Report").unwrap();
+    sheet.write_string(1, 0, "Generated 2024-01-01").unwrap();
+
+    // Real header at row 2
+    sheet.write_string(2, 0, "Name").unwrap();
+    sheet.write_string(2, 1, "Value").unwrap();
+
+    // Data
+    sheet.write_string(3, 0, "Alice").unwrap();
+    sheet.write_number(3, 1, 100.0).unwrap();
+    sheet.write_string(4, 0, "Bob").unwrap();
+    sheet.write_number(4, 1, 200.0).unwrap();
+
+    // Save via &Path like every sibling helper does; the previous
+    // path.to_str().unwrap() would panic on non-UTF-8 paths.
+    workbook.save(path).unwrap();
+}
+
+/// Create a test file with diverse data for filter testing.
+/// Sheet "Data" with 6 rows: State, City, Amount, Year, Status
+pub fn create_filterable(path: &Path) {
+    let mut workbook = Workbook::new();
+    let sheet = workbook.add_worksheet().set_name("Data").unwrap();
+
+    // Header row.
+    for (col, header) in ["State", "City", "Amount", "Year", "Status"].iter().enumerate() {
+        sheet.write_string(0, col as u16, *header).unwrap();
+    }
+
+    // One tuple per data row: (state, city, amount, year, status).
+    let rows: &[(&str, &str, f64, f64, &str)] = &[
+        ("CA", "Los Angeles", 1500.0, 2023.0, "Active"),
+        ("NY", "New York", 2000.0, 2023.0, "Active"),
+        ("CA", "San Francisco", 800.0, 2024.0, "Draft"),
+        ("TX", "Houston", 1200.0, 2024.0, "Active"),
+        ("NY", "Albany", 500.0, 2023.0, "Draft"),
+        ("FL", "Miami", 3000.0, 2024.0, "Active"),
+    ];
+    for (i, (state, city, amount, year, status)) in rows.iter().enumerate() {
+        let r = (i + 1) as u32;
+        sheet.write_string(r, 0, *state).unwrap();
+        sheet.write_string(r, 1, *city).unwrap();
+        sheet.write_number(r, 2, *amount).unwrap();
+        sheet.write_number(r, 3, *year).unwrap();
+        sheet.write_string(r, 4, *status).unwrap();
+    }
+
+    workbook.save(path).unwrap();
+}
+
+/// Create a pair of files for positional diff testing.
+/// File A: Name/Score — Alice/90, Bob/80, Charlie/70
+/// File B: Name/Score — Alice/90, Charlie/70, Dana/85
+/// Expected: Bob removed, Dana added
+pub fn create_diff_pair(path_a: &Path, path_b: &Path) {
+    // Both files share the same shape; a single writer covers them.
+    let write_scores = |path: &Path, rows: &[(&str, f64)]| {
+        let mut wb = Workbook::new();
+        let ws = wb.add_worksheet().set_name("Data").unwrap();
+        ws.write_string(0, 0, "Name").unwrap();
+        ws.write_string(0, 1, "Score").unwrap();
+        for (i, (name, score)) in rows.iter().enumerate() {
+            let r = (i + 1) as u32;
+            ws.write_string(r, 0, *name).unwrap();
+            ws.write_number(r, 1, *score).unwrap();
+        }
+        wb.save(path).unwrap();
+    };
+
+    write_scores(path_a, &[("Alice", 90.0), ("Bob", 80.0), ("Charlie", 70.0)]);
+    write_scores(path_b, &[("Alice", 90.0), ("Charlie", 70.0), ("Dana", 85.0)]);
+}
+
+/// Create a pair of files for key-based diff testing.
+/// File A: ID/Name/Score — "1"/Alice/90, "2"/Bob/80, "3"/Charlie/70
+/// File B: ID/Name/Score — "1"/Alice/95, "2"/Bob/80, "4"/Dana/85
+/// Expected: ID=1 modified (Score 90→95), ID=3 removed, ID=4 added
+pub fn create_diff_pair_with_keys(path_a: &Path, path_b: &Path) {
+    let write_keyed = |path: &Path, rows: &[(&str, &str, f64)]| {
+        let mut wb = Workbook::new();
+        let ws = wb.add_worksheet().set_name("Data").unwrap();
+        ws.write_string(0, 0, "ID").unwrap();
+        ws.write_string(0, 1, "Name").unwrap();
+        ws.write_string(0, 2, "Score").unwrap();
+        for (i, (id, name, score)) in rows.iter().enumerate() {
+            let r = (i + 1) as u32;
+            // IDs are deliberately written as text cells, not numbers.
+            ws.write_string(r, 0, *id).unwrap();
+            ws.write_string(r, 1, *name).unwrap();
+            ws.write_number(r, 2, *score).unwrap();
+        }
+        wb.save(path).unwrap();
+    };
+
+    write_keyed(path_a, &[("1", "Alice", 90.0), ("2", "Bob", 80.0), ("3", "Charlie", 70.0)]);
+    write_keyed(path_b, &[("1", "Alice", 95.0), ("2", "Bob", 80.0), ("4", "Dana", 85.0)]);
+}
+
+/// Create a pair of files for tolerance testing.
+/// File A: ID/Price — "1"/100.001, "2"/200.5
+/// File B: ID/Price — "1"/100.002, "2"/200.6
+/// Expected with tolerance 0.01: only ID=2 modified (diff=0.1 > 0.01)
+pub fn create_diff_pair_with_floats(path_a: &Path, path_b: &Path) {
+    let write_prices = |path: &Path, rows: &[(&str, f64)]| {
+        let mut wb = Workbook::new();
+        let ws = wb.add_worksheet().set_name("Data").unwrap();
+        ws.write_string(0, 0, "ID").unwrap();
+        ws.write_string(0, 1, "Price").unwrap();
+        for (i, (id, price)) in rows.iter().enumerate() {
+            let r = (i + 1) as u32;
+            ws.write_string(r, 0, *id).unwrap();
+            ws.write_number(r, 1, *price).unwrap();
+        }
+        wb.save(path).unwrap();
+    };
+
+    write_prices(path_a, &[("1", 100.001), ("2", 200.5)]);
+    write_prices(path_b, &[("1", 100.002), ("2", 200.6)]);
+}
diff --git a/tests/test_xldiff.rs b/tests/test_xldiff.rs
@@ -0,0 +1,223 @@
+mod common;
+
+use assert_cmd::Command;
+use predicates::prelude::*;
+use tempfile::TempDir;
+
+/// Build a `Command` pointing at the compiled `xldiff` binary under test.
+fn xldiff() -> Command {
+    let bin = Command::cargo_bin("xldiff");
+    bin.unwrap()
+}
+
+// ---------------------------------------------------------------------------
+// test_no_diff
+// ---------------------------------------------------------------------------
+
+/// Identical workbooks must report no differences and exit 0.
+#[test]
+fn test_no_diff() {
+    let tmp = TempDir::new().unwrap();
+    let file_a = tmp.path().join("a.xlsx");
+    let file_b = tmp.path().join("b.xlsx");
+    common::create_simple(&file_a);
+    common::create_simple(&file_b);
+
+    let mut cmd = xldiff();
+    cmd.arg(&file_a).arg(&file_b).arg("--no-color");
+    cmd.assert()
+        .success()
+        .stdout(predicate::str::contains("No differences found."));
+}
+
+// ---------------------------------------------------------------------------
+// test_positional_diff
+// ---------------------------------------------------------------------------
+
+/// Without a key, rows are compared positionally: one removed, one added.
+#[test]
+fn test_positional_diff() {
+    let tmp = TempDir::new().unwrap();
+    let file_a = tmp.path().join("a.xlsx");
+    let file_b = tmp.path().join("b.xlsx");
+    common::create_diff_pair(&file_a, &file_b);
+
+    let mut cmd = xldiff();
+    cmd.arg(&file_a).arg(&file_b).arg("--no-color");
+    cmd.assert()
+        .code(1)
+        .stdout(predicate::str::contains("Removed: 1"))
+        .stdout(predicate::str::contains("Added: 1"))
+        .stdout(predicate::str::contains("Bob"));
+}
+
+// ---------------------------------------------------------------------------
+// test_keyed_diff
+// ---------------------------------------------------------------------------
+
+/// With `--key ID`, rows are matched by key: one modified, one removed, one added.
+#[test]
+fn test_keyed_diff() {
+    let tmp = TempDir::new().unwrap();
+    let file_a = tmp.path().join("a.xlsx");
+    let file_b = tmp.path().join("b.xlsx");
+    common::create_diff_pair_with_keys(&file_a, &file_b);
+
+    let mut cmd = xldiff();
+    cmd.arg(&file_a).arg(&file_b).args(["--key", "ID", "--no-color"]);
+    cmd.assert()
+        .code(1)
+        .stdout(predicate::str::contains("Modified: 1"))
+        .stdout(predicate::str::contains("Removed: 1"))
+        .stdout(predicate::str::contains("Added: 1"));
+}
+
+// ---------------------------------------------------------------------------
+// test_tolerance
+// ---------------------------------------------------------------------------
+
+/// Numeric differences within `--tolerance` are ignored; larger ones are modifications.
+#[test]
+fn test_tolerance() {
+    let tmp = TempDir::new().unwrap();
+    let file_a = tmp.path().join("a.xlsx");
+    let file_b = tmp.path().join("b.xlsx");
+    common::create_diff_pair_with_floats(&file_a, &file_b);
+
+    // With tolerance 0.01: ID=1 diff is 0.001 (within tolerance), ID=2 diff is 0.1 (exceeds)
+    let mut cmd = xldiff();
+    cmd.arg(&file_a).arg(&file_b);
+    cmd.args(["--key", "ID", "--tolerance", "0.01", "--no-color"]);
+    cmd.assert()
+        .code(1)
+        .stdout(predicate::str::contains("Modified: 1"));
+}
+
+// ---------------------------------------------------------------------------
+// test_json_format
+// ---------------------------------------------------------------------------
+
+/// JSON output contains the three diff-category keys.
+#[test]
+fn test_json_format() {
+    let tmp = TempDir::new().unwrap();
+    let file_a = tmp.path().join("a.xlsx");
+    let file_b = tmp.path().join("b.xlsx");
+    common::create_diff_pair_with_keys(&file_a, &file_b);
+
+    let mut cmd = xldiff();
+    cmd.arg(&file_a).arg(&file_b).args(["--key", "ID", "--format", "json"]);
+    cmd.assert()
+        .code(1)
+        .stdout(predicate::str::contains("\"added\""))
+        .stdout(predicate::str::contains("\"removed\""))
+        .stdout(predicate::str::contains("\"modified\""));
+}
+
+// ---------------------------------------------------------------------------
+// test_markdown_format
+// ---------------------------------------------------------------------------
+
+/// Markdown output uses `##` section headings for each diff category.
+#[test]
+fn test_markdown_format() {
+    let tmp = TempDir::new().unwrap();
+    let file_a = tmp.path().join("a.xlsx");
+    let file_b = tmp.path().join("b.xlsx");
+    common::create_diff_pair_with_keys(&file_a, &file_b);
+
+    let mut cmd = xldiff();
+    cmd.arg(&file_a).arg(&file_b).args(["--key", "ID", "--format", "markdown"]);
+    cmd.assert()
+        .code(1)
+        .stdout(predicate::str::contains("## Added"))
+        .stdout(predicate::str::contains("## Removed"))
+        .stdout(predicate::str::contains("## Modified"));
+}
+
+// ---------------------------------------------------------------------------
+// test_csv_format
+// ---------------------------------------------------------------------------
+
+/// CSV output carries a `_status` column with the row's diff category.
+#[test]
+fn test_csv_format() {
+    let tmp = TempDir::new().unwrap();
+    let file_a = tmp.path().join("a.xlsx");
+    let file_b = tmp.path().join("b.xlsx");
+    common::create_diff_pair_with_keys(&file_a, &file_b);
+
+    let mut cmd = xldiff();
+    cmd.arg(&file_a).arg(&file_b).args(["--key", "ID", "--format", "csv"]);
+    cmd.assert()
+        .code(1)
+        .stdout(predicate::str::contains("_status"))
+        .stdout(predicate::str::contains("added"))
+        .stdout(predicate::str::contains("removed"))
+        .stdout(predicate::str::contains("modified"));
+}
+
+// ---------------------------------------------------------------------------
+// test_file_not_found
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_file_not_found() {
+ xldiff()
+ .arg("nonexistent_a.xlsx")
+ .arg("nonexistent_b.xlsx")
+ .assert()
+ .code(2)
+ .stderr(predicate::str::contains("file not found"));
+}
+
+// ---------------------------------------------------------------------------
+// test_sheet_selector
+// ---------------------------------------------------------------------------
+
+/// `path:Sheet` selectors allow diffing two sheets of one workbook.
+#[test]
+fn test_sheet_selector() {
+    let tmp = TempDir::new().unwrap();
+    let file = tmp.path().join("multi.xlsx");
+    common::create_multi_sheet(&file);
+
+    // Revenue and Expenses sheets have different schemas and data
+    let revenue = format!("{}:Revenue", file.display());
+    let expenses = format!("{}:Expenses", file.display());
+
+    let mut cmd = xldiff();
+    cmd.arg(&revenue).arg(&expenses);
+    cmd.assert().code(1);
+}
+
+// ---------------------------------------------------------------------------
+// test_cols_filter
+// ---------------------------------------------------------------------------
+
+/// `--cols` restricts which columns participate in the comparison.
+#[test]
+fn test_cols_filter() {
+    let tmp = TempDir::new().unwrap();
+    let file_a = tmp.path().join("a.xlsx");
+    let file_b = tmp.path().join("b.xlsx");
+    common::create_diff_pair_with_keys(&file_a, &file_b);
+
+    // With --cols ID,Name: Score column is excluded from comparison.
+    // ID=1: Score changed 90→95, but Score is excluded, so Name is same → no modification.
+    // ID=3: removed, ID=4: added.
+    // Modified should be 0.
+    let mut cmd = xldiff();
+    cmd.arg(&file_a).arg(&file_b);
+    cmd.args(["--key", "ID", "--cols", "ID,Name", "--no-color"]);
+    cmd.assert()
+        .code(1)
+        .stdout(predicate::str::contains("Modified: 0"))
+        .stdout(predicate::str::contains("Added: 1"))
+        .stdout(predicate::str::contains("Removed: 1"));
+}
diff --git a/tests/test_xlfilter.rs b/tests/test_xlfilter.rs
@@ -0,0 +1,384 @@
+mod common;
+
+use assert_cmd::Command;
+use predicates::prelude::*;
+use tempfile::TempDir;
+
+/// Build a `Command` pointing at the compiled `xlfilter` binary under test.
+fn xlfilter() -> Command {
+    let bin = Command::cargo_bin("xlfilter");
+    bin.unwrap()
+}
+
+/// Create a temp dir containing the standard filterable fixture workbook.
+/// Returns the dir (kept alive by the caller) and the workbook path.
+fn setup() -> (TempDir, std::path::PathBuf) {
+    let tmp = TempDir::new().unwrap();
+    let file = tmp.path().join("test.xlsx");
+    common::create_filterable(&file);
+    (tmp, file)
+}
+
+// === Basic functionality ===
+
+/// With no flags every row of the fixture is emitted.
+#[test]
+fn no_flags_shows_all_rows() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file);
+    cmd.assert()
+        .success()
+        .stdout(predicate::str::contains("CA"))
+        .stdout(predicate::str::contains("NY"))
+        .stdout(predicate::str::contains("TX"))
+        .stdout(predicate::str::contains("FL"))
+        .stderr(predicate::str::contains("6 rows"));
+}
+
+/// String equality filter keeps only matching rows.
+#[test]
+fn where_eq_string() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--where", "State=CA"]);
+    cmd.assert()
+        .success()
+        .stdout(predicate::str::contains("Los Angeles"))
+        .stdout(predicate::str::contains("San Francisco"))
+        .stdout(predicate::str::contains("NY").not())
+        .stderr(predicate::str::contains("2 rows"));
+}
+
+/// Numeric greater-than filter.
+#[test]
+fn where_gt_numeric() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--where", "Amount>1500"]);
+    cmd.assert()
+        .success()
+        .stdout(predicate::str::contains("New York"))
+        .stdout(predicate::str::contains("Miami"))
+        .stderr(predicate::str::contains("2 rows"));
+}
+
+/// Multiple `--where` flags combine with AND semantics.
+#[test]
+fn where_multiple_and() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--where", "State=CA", "--where", "Amount>1000"]);
+    cmd.assert()
+        .success()
+        .stdout(predicate::str::contains("Los Angeles"))
+        .stdout(predicate::str::contains("San Francisco").not())
+        .stderr(predicate::str::contains("1 rows"));
+}
+
+/// `!=` filter excludes matching rows.
+#[test]
+fn where_not_eq() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--where", "Status!=Draft"]);
+    cmd.assert()
+        .success()
+        .stderr(predicate::str::contains("4 rows"));
+}
+
+/// `~` performs a substring (contains) match.
+#[test]
+fn where_contains() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--where", "City~angel"]);
+    cmd.assert()
+        .success()
+        .stdout(predicate::str::contains("Los Angeles"))
+        .stderr(predicate::str::contains("1 rows"));
+}
+
+/// `!~` performs a negated substring match.
+#[test]
+fn where_not_contains() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--where", "Status!~raft"]);
+    cmd.assert()
+        .success()
+        .stderr(predicate::str::contains("4 rows"));
+}
+
+/// A filter matching nothing still succeeds, reporting 0 rows.
+#[test]
+fn where_no_matches() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--where", "State=ZZ"]);
+    cmd.assert()
+        .success()
+        .stderr(predicate::str::contains("0 rows"));
+}
+
+// === Column selection ===
+
+/// `--cols` with header names keeps only the named columns.
+#[test]
+fn cols_by_name() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--cols", "State,Amount"]);
+    cmd.assert()
+        .success()
+        .stdout(predicate::str::contains("State"))
+        .stdout(predicate::str::contains("Amount"))
+        .stdout(predicate::str::contains("City").not())
+        .stdout(predicate::str::contains("Year").not());
+}
+
+/// `--cols` also accepts Excel column letters.
+#[test]
+fn cols_by_letter() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--cols", "A,C"]);
+    cmd.assert()
+        .success()
+        .stdout(predicate::str::contains("State"))
+        .stdout(predicate::str::contains("Amount"))
+        .stdout(predicate::str::contains("City").not());
+}
+
+/// Letters and header names may be mixed in one `--cols` list.
+#[test]
+fn cols_mixed_letter_and_name() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--cols", "A,Amount"]);
+    cmd.assert()
+        .success()
+        .stdout(predicate::str::contains("State"))
+        .stdout(predicate::str::contains("Amount"))
+        .stdout(predicate::str::contains("City").not());
+}
+
+// === Sort ===
+
+/// Descending sort surfaces the largest Amount.
+#[test]
+fn sort_desc() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--sort", "Amount:desc", "--cols", "City,Amount"]);
+    cmd.assert()
+        .success()
+        .stdout(predicate::str::contains("Miami")); // 3000 = highest
+}
+
+/// Ascending sort plus limit 1 yields the smallest Amount.
+#[test]
+fn sort_asc() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--sort", "Amount:asc", "--limit", "1", "--cols", "City,Amount"]);
+    cmd.assert()
+        .success()
+        .stdout(predicate::str::contains("Albany")); // 500 = lowest
+}
+
+/// `--sort` also accepts an Excel column letter as the key.
+#[test]
+fn sort_by_column_letter() {
+    let (_tmp, file) = setup();
+    // C = Amount column
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--sort", "C:desc", "--limit", "1", "--cols", "City,Amount"]);
+    cmd.assert()
+        .success()
+        .stdout(predicate::str::contains("Miami")); // 3000 = highest
+}
+
+// === Limit, head, tail ===
+
+/// `--limit` caps the number of emitted rows.
+#[test]
+fn limit_caps_output() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--limit", "3"]);
+    cmd.assert()
+        .success()
+        .stderr(predicate::str::contains("3 rows"));
+}
+
+/// `--head` slices rows before `--where` filtering is applied.
+#[test]
+fn head_before_filter() {
+    let (_tmp, file) = setup();
+    // First 3 rows: CA/LA, NY/NYC, CA/SF
+    // Filter State=NY → only NYC
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--head", "3", "--where", "State=NY"]);
+    cmd.assert()
+        .success()
+        .stderr(predicate::str::contains("1 rows"));
+}
+
+/// `--tail` slices rows before `--where` filtering is applied.
+#[test]
+fn tail_before_filter() {
+    let (_tmp, file) = setup();
+    // Last 3 rows: TX/Houston, NY/Albany, FL/Miami
+    // Filter State=NY → only Albany
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--tail", "3", "--where", "State=NY"]);
+    cmd.assert()
+        .success()
+        .stderr(predicate::str::contains("1 rows"));
+}
+
+/// Passing both `--head` and `--tail` is rejected.
+#[test]
+fn head_and_tail_mutually_exclusive() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--head", "3", "--tail", "3"]);
+    cmd.assert()
+        .failure()
+        .stderr(predicate::str::contains("mutually exclusive"));
+}
+
+// === CSV output ===
+
+/// `--csv` emits comma-separated output instead of markdown tables.
+#[test]
+fn csv_output() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--csv", "--where", "State=CA"]);
+    cmd.assert()
+        .success()
+        .stdout(predicate::str::contains(",")) // CSV has commas
+        .stdout(predicate::str::contains("|").not()); // no markdown pipes
+}
+
+// === Sheet selection ===
+
+/// `--sheet` selects a worksheet by its name.
+#[test]
+fn sheet_by_name() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--sheet", "Data"]);
+    cmd.assert().success();
+}
+
+#[test]
+fn sheet_not_found() {
+ let (_dir, path) = setup();
+ xlfilter()
+ .arg(&path)
+ .args(["--sheet", "NoSuchSheet"])
+ .assert()
+ .failure()
+ .stderr(predicate::str::contains("not found"));
+}
+
+// === Error cases ===
+
+#[test]
+fn file_not_found() {
+ xlfilter()
+ .arg("nonexistent.xlsx")
+ .assert()
+ .failure()
+ .stderr(predicate::str::contains("not found"));
+}
+
+/// A `--where` expression without an operator is rejected.
+#[test]
+fn bad_filter_expr() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--where", "NoOperator"]);
+    cmd.assert()
+        .failure()
+        .stderr(predicate::str::contains("no operator found"));
+}
+
+/// An unknown sort direction is rejected.
+#[test]
+fn bad_sort_dir() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--sort", "Amount:up"]);
+    cmd.assert()
+        .failure()
+        .stderr(predicate::str::contains("invalid sort direction"));
+}
+
+/// Filtering on a column absent from the header fails.
+#[test]
+fn unknown_column_in_where() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--where", "Foo=bar"]);
+    cmd.assert()
+        .failure()
+        .stderr(predicate::str::contains("not found"));
+}
+
+/// Selecting a column absent from the header fails.
+#[test]
+fn unknown_column_in_cols() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--cols", "State,Foo"]);
+    cmd.assert()
+        .failure()
+        .stderr(predicate::str::contains("not found"));
+}
+
+/// Numeric comparison operators require a numeric right-hand side.
+#[test]
+fn gt_with_non_numeric_value() {
+    let (_tmp, file) = setup();
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--where", "Amount>abc"]);
+    cmd.assert()
+        .failure()
+        .stderr(predicate::str::contains("numeric value"));
+}
+
+// === Skip rows ===
+
+/// `--skip` drops leading metadata rows so the real header row is used.
+#[test]
+fn skip_metadata_rows() {
+    let tmp = TempDir::new().unwrap();
+    let file = tmp.path().join("meta.xlsx");
+    common::create_with_metadata(&file);
+
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--skip", "2"]);
+    cmd.assert()
+        .success()
+        .stdout(predicate::str::contains("Name"))
+        .stdout(predicate::str::contains("Alice"))
+        .stdout(predicate::str::contains("Bob"))
+        .stdout(predicate::str::contains("Quarterly Report").not())
+        .stderr(predicate::str::contains("2 rows"));
+}
+
+/// `--skip` composes with `--where`: filtering applies to post-skip rows.
+#[test]
+fn skip_with_filter() {
+    let tmp = TempDir::new().unwrap();
+    let file = tmp.path().join("meta.xlsx");
+    common::create_with_metadata(&file);
+
+    let mut cmd = xlfilter();
+    cmd.arg(&file).args(["--skip", "2", "--where", "Name=Alice"]);
+    cmd.assert()
+        .success()
+        .stdout(predicate::str::contains("Alice"))
+        .stdout(predicate::str::contains("Bob").not())
+        .stderr(predicate::str::contains("1 rows"));
+}