reader.rs (14180B)
1 use anyhow::{Context, Result}; 2 use calamine::{open_workbook_auto, Data, Reader}; 3 use polars::prelude::*; 4 use std::path::Path; 5 6 #[derive(Debug, Clone, Copy, PartialEq)] 7 enum InferredType { 8 Int, 9 Float, 10 String, 11 Bool, 12 DateTime, 13 Empty, 14 } 15 16 pub fn read_sheet(path: &Path, sheet_name: &str) -> Result<DataFrame> { 17 let mut workbook = open_workbook_auto(path) 18 .with_context(|| format!("Cannot open workbook: {}", path.display()))?; 19 let range = workbook 20 .worksheet_range(sheet_name) 21 .with_context(|| format!("Cannot read sheet: {sheet_name}"))?; 22 range_to_dataframe(&range) 23 } 24 25 pub fn range_to_dataframe(range: &calamine::Range<Data>) -> Result<DataFrame> { 26 range_to_dataframe_skip(range, 0) 27 } 28 29 /// Read a sheet, skipping the first `skip` rows before treating the next row as the header. 30 pub fn read_sheet_with_skip(path: &Path, sheet_name: &str, skip: usize) -> Result<DataFrame> { 31 let mut workbook = open_workbook_auto(path) 32 .with_context(|| format!("Cannot open workbook: {}", path.display()))?; 33 let range = workbook 34 .worksheet_range(sheet_name) 35 .with_context(|| format!("Cannot read sheet: {sheet_name}"))?; 36 range_to_dataframe_skip(&range, skip) 37 } 38 39 /// Convert a calamine Range to a DataFrame, skipping `skip` rows before the header. 40 pub fn range_to_dataframe_skip(range: &calamine::Range<Data>, skip: usize) -> Result<DataFrame> { 41 let rows: Vec<&[Data]> = range.rows().skip(skip).collect(); 42 let cols = if rows.is_empty() { 43 0 44 } else { 45 rows.iter().map(|r| r.len()).max().unwrap_or(0) 46 }; 47 48 if rows.is_empty() || cols == 0 { 49 return Ok(DataFrame::default()); 50 } 51 52 // First row (after skip) = headers 53 let headers: Vec<String> = rows[0] 54 .iter() 55 .enumerate() 56 .map(|(i, cell)| match cell { 57 Data::String(s) => s.clone(), 58 _ => format!("column_{i}"), 59 }) 60 .collect(); 61 62 if rows.len() == 1 { 63 // Header only, no data 64 let series: Vec<Column> = headers 65 .iter() 66 .map(|name| { 67 Series::new_empty(PlSmallStr::from(name.as_str()), &DataType::Null).into_column() 68 }) 69 .collect(); 70 return DataFrame::new(series).map_err(Into::into); 71 } 72 73 let data_rows = &rows[1..]; 74 let mut columns: Vec<Column> = Vec::with_capacity(cols); 75 76 for col_idx in 0..cols { 77 let cells: Vec<&Data> = data_rows 78 .iter() 79 .map(|row| { 80 if col_idx < row.len() { 81 &row[col_idx] 82 } else { 83 &Data::Empty 84 } 85 }) 86 .collect(); 87 88 let col_type = infer_column_type(&cells); 89 let series = build_series(&headers[col_idx], &cells, col_type)?; 90 columns.push(series.into_column()); 91 } 92 93 DataFrame::new(columns).map_err(Into::into) 94 } 95 96 fn infer_column_type(cells: &[&Data]) -> InferredType { 97 let mut has_int = false; 98 let mut has_float = false; 99 let mut has_string = false; 100 let mut has_bool = false; 101 let mut has_datetime = false; 102 let mut all_empty = true; 103 104 for cell in cells { 105 match cell { 106 Data::Empty => {} 107 Data::String(_) | Data::DateTimeIso(_) | Data::DurationIso(_) => { 108 has_string = true; 109 all_empty = false; 110 } 111 Data::Float(_) => { 112 has_float = true; 113 all_empty = false; 114 } 115 Data::Int(_) => { 116 has_int = true; 117 all_empty = false; 118 } 119 Data::Bool(_) => { 120 has_bool = true; 121 all_empty = false; 122 } 123 Data::DateTime(_) => { 124 has_datetime = true; 125 all_empty = false; 126 } 127 Data::Error(_) => { 128 has_string = true; 129 all_empty = false; 130 } 131 } 132 } 133 134 if all_empty { 135 return InferredType::Empty; 136 } 137 // String trumps everything 138 if has_string { 139 return InferredType::String; 140 } 141 // DateTime only if all non-empty cells are datetime 142 if has_datetime && !has_int && !has_float && !has_bool { 143 return InferredType::DateTime; 144 } 145 // Bool only if all non-empty cells are bool 146 if has_bool && !has_int && !has_float && !has_datetime { 147 return InferredType::Bool; 148 } 149 // Float if any float or mix of int/float 150 if has_float { 151 return InferredType::Float; 152 } 153 if has_int { 154 return InferredType::Int; 155 } 156 // Fallback: mixed datetime/bool/etc → string 157 InferredType::String 158 } 159 160 fn build_series(name: &str, cells: &[&Data], col_type: InferredType) -> Result<Series> { 161 let plname = PlSmallStr::from(name); 162 match col_type { 163 InferredType::Int => { 164 let values: Vec<Option<i64>> = cells 165 .iter() 166 .map(|cell| match cell { 167 Data::Int(v) => Some(*v), 168 Data::Empty => None, 169 _ => None, 170 }) 171 .collect(); 172 Ok(Series::new(plname, &values)) 173 } 174 InferredType::Float => { 175 let values: Vec<Option<f64>> = cells 176 .iter() 177 .map(|cell| match cell { 178 Data::Float(v) => Some(*v), 179 Data::Int(v) => Some(*v as f64), 180 Data::Empty => None, 181 _ => None, 182 }) 183 .collect(); 184 Ok(Series::new(plname, &values)) 185 } 186 InferredType::Bool => { 187 let values: Vec<Option<bool>> = cells 188 .iter() 189 .map(|cell| match cell { 190 Data::Bool(v) => Some(*v), 191 Data::Empty => None, 192 _ => None, 193 }) 194 .collect(); 195 Ok(Series::new(plname, &values)) 196 } 197 InferredType::DateTime => { 198 // calamine ExcelDateTime wraps a serial date float (days since 1899-12-30) 199 // Convert to milliseconds since Unix epoch for polars 200 let values: Vec<Option<i64>> = cells 201 .iter() 202 .map(|cell| match cell { 203 Data::DateTime(v) => { 204 let serial = v.as_f64(); 205 // Excel epoch: 1899-12-30 = -25569 days from Unix epoch 206 let days_from_unix = serial - 25569.0; 207 let ms = (days_from_unix * 86_400_000.0) as i64; 208 Some(ms) 209 } 210 Data::Empty => None, 211 _ => None, 212 }) 213 .collect(); 214 let series = Series::new(plname, &values); 215 Ok(series.cast(&DataType::Datetime(TimeUnit::Milliseconds, None))?) 216 } 217 InferredType::String | InferredType::Empty => { 218 let values: Vec<Option<String>> = cells 219 .iter() 220 .map(|cell| match cell { 221 Data::String(s) => Some(s.clone()), 222 Data::Float(v) => Some(v.to_string()), 223 Data::Int(v) => Some(v.to_string()), 224 Data::Bool(v) => Some(v.to_string()), 225 Data::DateTime(v) => Some(v.as_f64().to_string()), 226 Data::Error(e) => Some(format!("{e:?}")), 227 Data::DateTimeIso(s) | Data::DurationIso(s) => Some(s.clone()), 228 Data::Empty => None, 229 }) 230 .collect(); 231 Ok(Series::new(plname, &values)) 232 } 233 } 234 } 235 236 #[cfg(test)] 237 mod tests { 238 use super::*; 239 use rust_xlsxwriter::Workbook; 240 use tempfile::NamedTempFile; 241 242 /// Create a simple xlsx with mixed types: string, float, float (int-like), bool 243 fn create_simple(path: &std::path::Path) { 244 let mut wb = Workbook::new(); 245 let ws = wb.add_worksheet().set_name("Data").unwrap(); 246 ws.write_string(0, 0, "name").unwrap(); 247 ws.write_string(0, 1, "amount").unwrap(); 248 ws.write_string(0, 2, "count").unwrap(); 249 ws.write_string(0, 3, "active").unwrap(); 250 251 let names = ["Alice", "Bob", "Charlie", "Diana", "Eve"]; 252 let amounts = [100.50, 200.75, 300.00, 400.25, 500.00]; 253 let counts = [10.0, 20.0, 30.0, 40.0, 50.0]; 254 let active = [true, false, true, false, true]; 255 256 for i in 0..5u32 { 257 ws.write_string(i + 1, 0, names[i as usize]).unwrap(); 258 ws.write_number(i + 1, 1, amounts[i as usize]).unwrap(); 259 ws.write_number(i + 1, 2, counts[i as usize]).unwrap(); 260 ws.write_boolean(i + 1, 3, active[i as usize]).unwrap(); 261 } 262 wb.save(path).unwrap(); 263 } 264 265 /// Create xlsx with header row only, no data 266 fn create_empty_data(path: &std::path::Path) { 267 let mut wb = Workbook::new(); 268 let ws = wb.add_worksheet().set_name("Empty").unwrap(); 269 ws.write_string(0, 0, "col_a").unwrap(); 270 ws.write_string(0, 1, "col_b").unwrap(); 271 wb.save(path).unwrap(); 272 } 273 274 /// Create xlsx with a completely empty sheet 275 fn create_empty_sheet(path: &std::path::Path) { 276 let mut wb = Workbook::new(); 277 wb.add_worksheet().set_name("Blank").unwrap(); 278 wb.save(path).unwrap(); 279 } 280 281 #[test] 282 fn test_infer_int_column() { 283 let cells = vec![ 284 &Data::Int(1), 285 &Data::Int(2), 286 &Data::Empty, 287 &Data::Int(4), 288 ]; 289 assert_eq!(infer_column_type(&cells), InferredType::Int); 290 } 291 292 #[test] 293 fn test_infer_float_when_mixed_int_float() { 294 let cells = vec![ 295 &Data::Int(1), 296 &Data::Float(2.5), 297 &Data::Int(3), 298 ]; 299 assert_eq!(infer_column_type(&cells), InferredType::Float); 300 } 301 302 #[test] 303 fn test_infer_string_trumps_all() { 304 let s = Data::String("hello".to_string()); 305 let cells: Vec<&Data> = vec![ 306 &Data::Int(1), 307 &s, 308 &Data::Float(3.0), 309 ]; 310 assert_eq!(infer_column_type(&cells), InferredType::String); 311 } 312 313 #[test] 314 fn test_infer_empty_column() { 315 let cells: Vec<&Data> = vec![&Data::Empty, &Data::Empty]; 316 assert_eq!(infer_column_type(&cells), InferredType::Empty); 317 } 318 319 #[test] 320 fn test_infer_bool_column() { 321 let cells = vec![ 322 &Data::Bool(true), 323 &Data::Bool(false), 324 &Data::Empty, 325 ]; 326 assert_eq!(infer_column_type(&cells), InferredType::Bool); 327 } 328 329 #[test] 330 fn test_empty_range() { 331 let range: calamine::Range<Data> = Default::default(); 332 let df = range_to_dataframe(&range).unwrap(); 333 assert_eq!(df.height(), 0); 334 assert_eq!(df.width(), 0); 335 } 336 337 #[test] 338 fn test_range_to_dataframe_basic() { 339 let tmp = NamedTempFile::with_suffix(".xlsx").unwrap(); 340 create_simple(tmp.path()); 341 342 let df = read_sheet(tmp.path(), "Data").unwrap(); 343 assert_eq!(df.height(), 5); 344 assert_eq!(df.width(), 4); 345 346 let names: Vec<String> = df 347 .get_column_names() 348 .iter() 349 .map(|s| s.to_string()) 350 .collect(); 351 assert_eq!(names, vec!["name", "amount", "count", "active"]); 352 } 353 354 #[test] 355 fn test_read_sheet_types() { 356 let tmp = NamedTempFile::with_suffix(".xlsx").unwrap(); 357 create_simple(tmp.path()); 358 359 let df = read_sheet(tmp.path(), "Data").unwrap(); 360 361 // name column should be String 362 assert_eq!(df.column("name").unwrap().dtype(), &DataType::String); 363 364 // amount column should be Float64 365 assert_eq!(df.column("amount").unwrap().dtype(), &DataType::Float64); 366 367 // active column should be Boolean 368 assert_eq!(df.column("active").unwrap().dtype(), &DataType::Boolean); 369 } 370 371 #[test] 372 fn test_read_sheet_header_only() { 373 let tmp = NamedTempFile::with_suffix(".xlsx").unwrap(); 374 create_empty_data(tmp.path()); 375 376 let df = read_sheet(tmp.path(), "Empty").unwrap(); 377 assert_eq!(df.height(), 0); 378 assert_eq!(df.width(), 2); 379 let names: Vec<String> = df 380 .get_column_names() 381 .iter() 382 .map(|s| s.to_string()) 383 .collect(); 384 assert_eq!(names, vec!["col_a", "col_b"]); 385 } 386 387 #[test] 388 fn test_read_sheet_empty_sheet() { 389 let tmp = NamedTempFile::with_suffix(".xlsx").unwrap(); 390 create_empty_sheet(tmp.path()); 391 392 let df = read_sheet(tmp.path(), "Blank").unwrap(); 393 assert_eq!(df.height(), 0); 394 assert_eq!(df.width(), 0); 395 } 396 397 /// Create xlsx with metadata rows above the real header 398 fn create_with_metadata_rows(path: &std::path::Path) { 399 let mut wb = Workbook::new(); 400 let ws = wb.add_worksheet().set_name("Data").unwrap(); 401 ws.write_string(0, 0, "Report Title").unwrap(); 402 ws.write_string(1, 0, "Generated 2024-01-01").unwrap(); 403 ws.write_string(2, 0, "Name").unwrap(); 404 ws.write_string(2, 1, "Value").unwrap(); 405 ws.write_string(3, 0, "Alice").unwrap(); 406 ws.write_number(3, 1, 100.0).unwrap(); 407 ws.write_string(4, 0, "Bob").unwrap(); 408 ws.write_number(4, 1, 200.0).unwrap(); 409 wb.save(path).unwrap(); 410 } 411 412 #[test] 413 fn test_read_with_skip() { 414 let tmp = NamedTempFile::with_suffix(".xlsx").unwrap(); 415 create_with_metadata_rows(tmp.path()); 416 let df = read_sheet_with_skip(tmp.path(), "Data", 2).unwrap(); 417 let col_names: Vec<String> = df.get_column_names().iter().map(|s| s.to_string()).collect(); 418 assert_eq!(col_names, vec!["Name", "Value"]); 419 assert_eq!(df.height(), 2); 420 } 421 422 #[test] 423 fn test_read_with_skip_zero() { 424 let tmp = NamedTempFile::with_suffix(".xlsx").unwrap(); 425 create_simple(tmp.path()); 426 let df = read_sheet_with_skip(tmp.path(), "Data", 0).unwrap(); 427 assert_eq!(df.height(), 5); 428 assert_eq!(df.width(), 4); 429 } 430 }