BazerData.jl

Data manipulation utilities for Julia
Log | Files | Refs | README | LICENSE

xtile.jl (4644B)


      @testset "xtile" begin

          # Penguin data with incomplete rows dropped; shared by the tests below.
          df = dropmissing(DataFrame(PalmerPenguins.load()))
          # Unit weights: a weighted call using them must match the unweighted call.
          unit_weights = Weights(repeat([1], inner=nrow(df)))

          # -- test on strings (lexicographic ordering)
          a = xtile(df.species, 2)
          b = xtile(df.species, 2; weights=unit_weights)
          @test a == b                                     # uniform weights == no weights
          @test all(0 .<= a .<= 2)                         # bins in valid range
          # same species must get same bin
          for sp in unique(df.species)
              @test allequal(a[df.species .== sp])
          end
          # lexicographic order: Adelie < Chinstrap < Gentoo
          @test a[findfirst(df.species .== "Adelie")] <= a[findfirst(df.species .== "Chinstrap")]
          @test a[findfirst(df.species .== "Chinstrap")] <= a[findfirst(df.species .== "Gentoo")]

          # -- string xtile with non-alphabetical categories
          s_nonalpha = ["z", "z", "z", "a", "a", "m"]
          result = xtile(s_nonalpha, 2)
          @test allequal(result[1:3])         # all "z" in same bin
          @test allequal(result[4:5])         # all "a" in same bin
          @test all(0 .<= result .<= 2)       # bins are valid
          # lexicographic: "a" < "m" < "z"
          @test result[4] <= result[6]        # "a" <= "m"
          @test result[6] <= result[1]        # "m" <= "z"

          # -- test on int
          a = xtile(df.flipper_length_mm, 2)
          @test sum(a) == 173
          b = xtile(df.flipper_length_mm, 10)
          @test sum(b) == 1539
          c = xtile(df.flipper_length_mm, 100)
          @test sum(c) == 16923
          d = xtile(df.flipper_length_mm, 10, weights=unit_weights)
          @test d == b
          # The weights are random, so only the upper bound on the bins is asserted.
          e = xtile(df.flipper_length_mm, 10, weights=Weights(rand(nrow(df))))
          @test all(e .<= 10)

          # -- test on Float
          a = xtile(df.bill_depth_mm, 2)
          @test sum(a) == 173
          b = xtile(df.bill_depth_mm, 10)
          @test sum(b) == 1533
          c = xtile(df.bill_depth_mm, 100)
          @test sum(c) == 16741
          d = xtile(df.bill_depth_mm, 10, weights=unit_weights)
          @test d == b
          # The weights are random, so only the upper bound on the bins is asserted.
          e = xtile(df.bill_depth_mm, 10, weights=Weights(rand(nrow(df))))
          @test all(e .<= 10)

          # -- test on Union{Missing, Int64}
          x_m = collect(Union{Int64,Missing}, 1:1_000_000)
          # Blank out 10_000 distinct, randomly chosen positions.
          x_m[sample(eachindex(x_m), 10_000, replace=false)] .= missing
          q_m = xtile(x_m, 10)
          # every missing input must come back as a missing bin
          @test sum(ismissing.(q_m)) == 10_000
          # the non-missing bins must match xtile run on the non-missing values alone
          @test q_m[.!ismissing.(q_m)] == xtile(collect(skipmissing(x_m)), 10)

          # -- test on Union{Missing, AbstractString}
          s_m = ["a", "c", "g", missing, "e", missing, "za"]
          result_m = xtile(s_m, 3)
          @test count(ismissing, result_m) == 2           # missing preserved
          @test all(0 .<= skipmissing(result_m) .<= 3)    # bins in valid range
          # lexicographic: a < c < e < g < za
          # Compare every pair of non-missing entries (not only neighbours in input
          # order) so that out-of-order inputs such as "g" before "e" are covered too.
          non_miss_idx = findall(!ismissing, s_m)
          for i in non_miss_idx, j in non_miss_idx
              if s_m[i] < s_m[j]
                  @test result_m[i] <= result_m[j]        # ordering preserved
              end
          end

      end
     77 
     78 
     @testset "xtile - edge cases" begin

         # A vector holding nothing but missings comes back all-missing, same length.
         only_missing = Vector{Union{Int64, Missing}}(missing, 10)
         bins = xtile(only_missing, 4)
         @test all(ismissing.(bins))
         @test length(bins) == 10

         # One-element inputs yield one bin; for the Float64 input the bin is an Int.
         bins = xtile([42.0], 5)
         @test length(bins) == 1
         @test first(bins) isa Int

         for (input, k) in (([42], 5), (["hello"], 3))
             @test length(xtile(input, k)) == 1
         end

         # One-element input whose element type allows missing: the bin is not missing.
         lone_value = Union{Int, Missing}[7]
         bins = xtile(lone_value, 3)
         @test length(bins) == 1
         @test first(bins) !== missing

         # Two distinct values: the smaller one lands in a strictly lower bin.
         bins = xtile([1.0, 2.0], 2)
         @test length(bins) == 2
         lo, hi = bins
         @test lo < hi

         # A constant vector: every element shares a single bin.
         constant = fill(5.0, 100)
         bins = xtile(constant, 10)
         @test allequal(bins)
         @test length(bins) == 100

         # A single quantile: every bin is either 0 or 1.
         # NOTE(review): which elements receive 1 depends on xtile's internals — confirm.
         bins = xtile(rand(50), 1)
         @test all(in((0, 1)), bins)

         # Ten quantiles on 100 random values: bins stay within 0:10.
         bins = xtile(rand(100), 10)
         @test all(0 .<= bins .<= 10)

         # A non-positive quantile count is rejected with an exception.
         for bad_n in (0, -1)
             @test_throws Exception xtile([1.0, 2.0, 3.0], bad_n)
         end

     end