BazerData.jl

Data manipulation utilities for Julia
Log | Files | Refs | README | LICENSE

winsorize.jl (4231B)


      @testset "winsorize" begin

          # NOTE: every hard-coded index list below depends on this seed and on the
          # order of the `rand` calls that follow, so the setup must not be reordered.
          Random.seed!(3)
          x1 = rand(100)
          x2 = Vector{Union{Float64, Missing}}(rand(Float64, 100))
          x2[rand(collect(1:100), 5)] .= missing

          # --- vector without missing values
          w1 = winsorize(x1, probs=(0.05, 0.95), verbose=true)
          @test findall(x1 .!= w1) == [4, 15, 26, 32, 40, 44, 52, 59, 64, 97]

          # default settings are expected to leave x1 untouched
          w1 = winsorize(x1; verbose=true)
          @test isempty(findall(x1 .!= w1))

          w1 = winsorize(x1; cutpoints=(0.01, 0.99), verbose=true)
          @test findall(x1 .!= w1) == [4, 26, 52]

          # x1 lies in [0, 1), so a lower cutpoint of 0 must not move the minimum
          w1 = winsorize(x1; cutpoints=(0, 0.9), verbose=true)
          @test isequal(minimum(x1), minimum(w1))

          # --- vector containing missing values
          # indices (in x2) of non-missing entries that differ from `w`
          moved(w) = findall(skipmissing(x2 .!= w))
          # indices of x2 expected to be affected by cutpoints=(0.05, 0.95)
          hit_05_95 = [5, 17, 41, 42, 65, 83, 91]

          w2 = winsorize(x2, probs=(0.02, 0.98), verbose=true)
          @test size(x2) == size(w2)
          @test moved(w2) == [5, 41, 83, 91]

          w2 = winsorize(x2; verbose=true)
          @test size(x2) == size(w2)
          @test isempty(moved(w2))

          w2 = winsorize(x2; cutpoints=(0.05, 0.95), verbose=true)
          @test size(x2) == size(w2)
          @test moved(w2) == hit_05_95

          # --- replace_value: clipped entries become missing, given either as a
          #     (low, high) pair or as a single value
          for fill_value in ((missing, missing), missing)
              w2 = winsorize(
                  x2;
                  cutpoints=(0.05, 0.95),
                  replace_value=fill_value,
                  verbose=true,
              )
              @test size(x2) == size(w2)
              @test findall(ismissing.(x2) .!= ismissing.(w2)) == hit_05_95
          end

          # --- replace_value: clipped entries become fixed numbers
          w2 = winsorize(
              x2;
              cutpoints=(0.05, 0.95),
              replace_value=(-1.0, 1.0),
              verbose=true,
          )
          @test size(x2) == size(w2)
          @test findall(in((-1.0, 1.0)), skipmissing(w2)) == hit_05_95

          # Same check with replacement values whose type (Int) differs slightly
          # from the data's; whether this should keep being accepted is an open
          # question.
          w2 = winsorize(x2; cutpoints=(0.05, 0.95), replace_value=(-1, 1), verbose=true)
          @test size(x2) == size(w2)
          @test findall(in((-1.0, 1.0)), skipmissing(w2)) == hit_05_95

      end
     53 
     54 
     @testset "winsorize - custom IQR" begin
         Random.seed!(42)
         sample = randn(1000)  # standard-normal draws

         # number of entries of `sample` altered by `winsorize` with the given keywords
         nchanged(; kwargs...) = count(sample .!= winsorize(sample; kwargs...))

         # baseline: default settings (IQR=3 according to the original test notes —
         # confirm against the `winsorize` docstring)
         changed_default = nchanged()

         # a smaller multiplier must clip strictly more entries than the default
         changed_tight = nchanged(IQR=1)
         @test changed_tight > changed_default

         # a very large multiplier must leave every entry untouched
         @test nchanged(IQR=100) == 0
     end
     72 
     73 
     @testset "winsorize - edge cases" begin
         # constant vector: there is nothing to clip
         constant = fill(5.0, 50)
         @test winsorize(constant, probs=(0.05, 0.95)) == constant

         # vector with a single element
         singleton = [3.14]
         @test winsorize(singleton, probs=(0.1, 0.9)) == singleton

         # integer input: length is preserved, the result stays within the original
         # range, and at least one entry is clipped
         x_int = collect(1:100)
         w_int = winsorize(x_int, probs=(0.05, 0.95))
         lo, hi = extrema(w_int)
         @test length(w_int) == 100
         @test lo >= minimum(x_int)
         @test hi <= maximum(x_int)
         @test count(w_int .!= x_int) > 0

         Random.seed!(1)
         x = rand(100)
         x_min, x_max = extrema(x)

         # one-sided: lower cutpoint sits at the data minimum, so only the top is clipped
         lo, hi = extrema(winsorize(x, cutpoints=(x_min, 0.5)))
         @test lo == x_min
         @test hi <= 0.5

         # one-sided: upper cutpoint sits at the data maximum, so only the bottom is clipped
         lo, hi = extrema(winsorize(x, cutpoints=(0.5, x_max)))
         @test lo >= 0.5
         @test hi == x_max
     end
    105 
    106 
    @testset "winsorize - all missing" begin
        # With every entry missing, nothing is left once missings are skipped, so
        # computing quantiles on the `probs` path is expected to throw.
        only_missing = Vector{Union{Float64, Missing}}(missing, 10)
        @test_throws Exception winsorize(only_missing, probs=(0.05, 0.95))
    end
    112 
    113 
    @testset "winsorize - error paths" begin
        # an empty vector must be rejected
        @test_throws Exception winsorize(Float64[])

        # probability bounds outside [0, 1] must be rejected
        for bad_probs in ((-0.1, 0.9), (0.1, 1.1))
            @test_throws Exception winsorize([1.0, 2.0, 3.0], probs=bad_probs)
        end
    end