xtile.jl (4644B)
# Tests for `xtile` (quantile-bin assignment).
#
# The first testset exercises string, integer, float and `missing`-carrying
# inputs, with and without observation weights. The second covers degenerate
# inputs (all missing, single element, constant vectors) and argument checks.
@testset "xtile" begin

    df = dropmissing(DataFrame(PalmerPenguins.load()))
    n_obs = nrow(df)

    # -- test on strings (lexicographic ordering)
    a = xtile(df.species, 2)
    b = xtile(df.species, 2; weights=Weights(fill(1, n_obs)))
    @test a == b                 # uniform weights == no weights
    @test all(0 .<= a .<= 2)     # bins in valid range
    # same species must get same bin
    for sp in unique(df.species)
        @test allequal(a[df.species .== sp])
    end
    # lexicographic order: Adelie < Chinstrap < Gentoo
    @test a[findfirst(df.species .== "Adelie")] <= a[findfirst(df.species .== "Chinstrap")]
    @test a[findfirst(df.species .== "Chinstrap")] <= a[findfirst(df.species .== "Gentoo")]

    # -- string xtile with non-alphabetical categories
    s_nonalpha = ["z", "z", "z", "a", "a", "m"]
    result = xtile(s_nonalpha, 2)
    @test allequal(result[1:3])      # all "z" in same bin
    @test allequal(result[4:5])      # all "a" in same bin
    @test all(0 .<= result .<= 2)    # bins are valid
    # lexicographic: "a" < "m" < "z"
    @test result[4] <= result[6]     # "a" <= "m"
    @test result[6] <= result[1]     # "m" <= "z"

    # -- test on int
    a = xtile(df.flipper_length_mm, 2)
    @test sum(a) == 173
    b = xtile(df.flipper_length_mm, 10)
    @test sum(b) == 1539
    c = xtile(df.flipper_length_mm, 100)
    @test sum(c) == 16923
    d = xtile(df.flipper_length_mm, 10; weights=Weights(fill(1, n_obs)))
    @test d == b
    # Random weights: only the upper bound on the bins is asserted, so the
    # check does not depend on the particular draw.
    e = xtile(df.flipper_length_mm, 10; weights=Weights(rand(n_obs)))
    @test all(e .<= 10)

    # -- test on Float
    a = xtile(df.bill_depth_mm, 2)
    @test sum(a) == 173
    b = xtile(df.bill_depth_mm, 10)
    @test sum(b) == 1533
    c = xtile(df.bill_depth_mm, 100)
    @test sum(c) == 16741
    d = xtile(df.bill_depth_mm, 10; weights=Weights(fill(1, n_obs)))
    @test d == b
    e = xtile(df.bill_depth_mm, 10; weights=Weights(rand(n_obs)))
    @test all(e .<= 10)

    # -- test on Union{Missing, Int64}
    x_m = Vector{Union{Int64,Missing}}(1:1_000_000)
    x_m[sample(1:length(x_m), 10_000, replace=false)] .= missing
    q_m = xtile(x_m, 10)
    # missing inputs must come back as missing bins
    @test sum(ismissing.(q_m)) == 10_000
    # non-missing bins must match xtile applied to the non-missing subset
    @test q_m[.!ismissing.(q_m)] == xtile(collect(skipmissing(x_m)), 10)

    # -- test on Union{Missing, AbstractString}
    s_m = ["a", "c", "g", missing, "e", missing, "za"]
    result_m = xtile(s_m, 3)
    @test count(ismissing, result_m) == 2            # missing preserved
    @test all(0 .<= skipmissing(result_m) .<= 3)     # bins in valid range
    # lexicographic: a < c < e < g < za
    # Sort the non-missing values and require their bins to be non-decreasing,
    # so every pair of values is covered (not only neighbours in input order).
    non_miss_idx = findall(!ismissing, s_m)
    non_miss_vals = s_m[non_miss_idx]
    non_miss_bins = result_m[non_miss_idx]
    @test issorted(non_miss_bins[sortperm(non_miss_vals)])   # ordering preserved

end


@testset "xtile - edge cases" begin

    # all-missing input
    x_all_missing = Vector{Union{Int64,Missing}}(fill(missing, 10))
    result = xtile(x_all_missing, 4)
    @test all(ismissing, result)
    @test length(result) == 10

    # single-element vector: one bin is returned, and it is an Int
    result = xtile([42.0], 5)
    @test length(result) == 1
    @test result[1] isa Int

    result = xtile([42], 5)
    @test length(result) == 1

    result = xtile(["hello"], 3)
    @test length(result) == 1

    # single-element with missing wrapper
    x_single_m = Union{Int,Missing}[7]
    result = xtile(x_single_m, 3)
    @test length(result) == 1
    @test !ismissing(result[1])

    # two-element vector: lower value gets the strictly lower bin
    result = xtile([1.0, 2.0], 2)
    @test length(result) == 2
    @test result[1] < result[2]

    # all identical values: all should get the same bin
    x_same = fill(5.0, 100)
    result = xtile(x_same, 10)
    @test allequal(result)
    @test length(result) == 100

    # n_quantiles == 1: every bin must be 0 or 1
    result = xtile(rand(50), 1)
    @test all(r -> r in (0, 1), result)

    # bins are bounded by n_quantiles
    result = xtile(rand(100), 10)
    @test all(r -> 0 <= r <= 10, result)

    # n_quantiles validation: non-positive counts must be rejected
    @test_throws Exception xtile([1.0, 2.0, 3.0], 0)
    @test_throws Exception xtile([1.0, 2.0, 3.0], -1)

end