tabulate.jl (6541B)
# Tests for `tabulate`: long-format output (as DataFrame, string, and stdout),
# group-type handling, and the two-way / wide layout.
@testset "Tabulate" begin

    # on existing dataset
    df = dropmissing(DataFrame(PalmerPenguins.load()))
    cols = :island

    # Test that the function does not error on an empty selection
    # (no penguin in the dataset lives on "Brehat")
    @test isnothing(tabulate(df[df.island .== "Brehat", :], :sex))

    # frequencies must match a plain groupby/combine count
    col_length = combine(groupby(df, cols), cols .=> length => :_N)
    sort!(col_length, cols)
    col_tab = tabulate(df, :island; out=:df)
    sort!(col_tab, cols)
    @test col_length._N == col_tab.freq

    # test the string output
    # (the `:string` output is used directly; no IOBuffer round-trip is needed)
    tab_string = tabulate(df, :island; out=:string)
    @test count(==('\n'), tab_string) == 5 # test number of lines expected
    first_line = split(tab_string, '\n', limit=2)[1]
    @test all(
        x -> contains(first_line, x),
        ["island", "Freq", "Percent", "Cum", "Hist."],
    )

    tab_string = tabulate(df, :island; out=:string, skip_stat=:freq_hist)
    @test count(==('\n'), tab_string) == 5 # test number of lines expected
    first_line = split(tab_string, '\n', limit=2)[1]
    @test all(x -> contains(first_line, x), ["island", "Freq", "Percent", "Cum"])

    # test the nothing output
    tab_stdout = tabulate(df, :island, out=:stdout)
    @test isnothing(tab_stdout)
    # capturing stdout relies on `stdout_string` from a helper package
    tab_stdout = stdout_string() do
        tabulate(df, :island, out=:stdout)
    end
    @test count(==('\n'), tab_stdout) == 5 # test number of lines expected
    first_line = split(tab_stdout, '\n', limit=2)[1]
    @test all(
        x -> contains(first_line, x),
        ["island", "Freq", "Percent", "Cum", "Hist."],
    )

    # test the type columns get properly passed
    @test contains(
        tabulate(df, [:island, :species], group_type=[:type, :value], out=:string),
        "island_typeof",
    )
    @test contains(
        tabulate(df, [:island, :species], group_type=[:value, :type], out=:string),
        "species_typeof",
    )

    # test the two-way and wide tabulate
    df_twoway = tabulate(df, [:island, :species], format_tbl=:wide, out=:df)
    @test names(df_twoway) == ["-", "Adelie", "Gentoo", "Chinstrap", "Total by island"]
    @test nrow(df_twoway) == 4
    df_twoway = tabulate(df, [:sex, :island, :species], format_tbl=:wide, out=:df)
    @test names(df_twoway) ==
        ["-", "--", "Adelie", "Gentoo", "Chinstrap", "Total by sex, island"]
    @test nrow(df_twoway) == 7

    # on a specific dataset (see issue #1)
    df = DataFrame(x = [1, 2, 5, "NA", missing], y = ["a", "c", "b", "e", "d"])
    df_tab = tabulate(df, :x, reorder_cols=true, out=:df)
    @test isequal(df_tab.x, df.x)

    # test the group type options
    df = DataFrame(x = [1, 2, 2, "NA", missing], y = ["c", "c", "b", "z", "d"])
    @test isequal(
        tabulate(df, [:x, :y], out=:df).y,
        sort(df.y))
    @test nrow(tabulate(df, [:x, :y], group_type=:value, out=:df)) == 5
    @test nrow(tabulate(df, [:x, :y], group_type=:type, out=:df)) == 3
    @test nrow(tabulate(df, [:x, :y], group_type=[:type, :value], out=:df)) == 4
    @test nrow(tabulate(df, [:x, :y], group_type=[:value, :type], out=:df)) == 4

end


# Wide layout with percentages instead of frequencies.
@testset "Tabulate - wide format pct" begin
    df = dropmissing(DataFrame(PalmerPenguins.load()))

    # wide format with format_stat=:pct returns a DataFrame
    df_pct = tabulate(df, [:island, :species], format_tbl=:wide, format_stat=:pct, out=:df)
    @test df_pct isa DataFrame
    @test nrow(df_pct) == 3
    # pct columns should not have a totals column (unlike freq)
    @test !any(contains.(names(df_pct), "Total"))

    # wide format pct as string output
    pt = tabulate(
        df, [:island, :species], format_tbl=:wide, format_stat=:pct, out=:string
    )
    @test pt isa String
    @test length(pt) > 0

    # wide format pct stdout returns nothing
    result = tabulate(
        df, [:island, :species], format_tbl=:wide, format_stat=:pct, out=:stdout
    )
    @test isnothing(result)
end


# Wide layout rendered as a string.
@testset "Tabulate - wide format string output" begin
    df = dropmissing(DataFrame(PalmerPenguins.load()))

    # wide freq as string
    pt = tabulate(df, [:island, :species], format_tbl=:wide, out=:string)
    @test pt isa String
    @test contains(pt, "Adelie")
    @test contains(pt, "Gentoo")
    @test contains(pt, "Chinstrap")

    # 3-column wide as string
    pt = tabulate(df, [:sex, :island, :species], format_tbl=:wide, out=:string)
    @test pt isa String
    @test contains(pt, "Adelie")
end


# `missing` entries in the tabulated columns.
@testset "Tabulate - missing values" begin
    # DataFrame with missing values in the tabulated column
    df = DataFrame(x = [1, 2, missing, 1, missing, 3])
    df_tab = tabulate(df, :x, out=:df)
    @test nrow(df_tab) == 4 # 1, 2, 3, missing
    @test sum(df_tab.freq) == 6
    @test :freq in propertynames(df_tab)
    @test :pct in propertynames(df_tab)
    @test :cum in propertynames(df_tab)

    # string output with missing values should not error
    pt = tabulate(df, :x, out=:string)
    @test pt isa String
    @test contains(pt, "missing")

    # two-column with missing
    df = DataFrame(x = ["a", "b", missing, "a"], y = [1, 2, 3, missing])
    df_tab = tabulate(df, [:x, :y], out=:df)
    @test nrow(df_tab) == 4
    @test sum(df_tab.freq) == 4
end


# `skip_stat` given as a vector of symbols or as a single symbol.
@testset "Tabulate - skip_stat vector" begin
    df = dropmissing(DataFrame(PalmerPenguins.load()))

    # skip multiple stats
    pt = tabulate(df, :island, out=:string, skip_stat=[:freq_hist, :cum])
    first_line = split(pt, '\n', limit=2)[1]
    @test contains(first_line, "Freq")
    @test contains(first_line, "Percent")
    @test !contains(first_line, "Cum")
    @test !contains(first_line, "Hist")

    # skip just freq
    pt = tabulate(df, :island, out=:string, skip_stat=:freq)
    first_line = split(pt, '\n', limit=2)[1]
    @test !contains(first_line, "Freq.")
    @test contains(first_line, "Percent")
end


# Degenerate input: a single observation.
@testset "Tabulate - single row DataFrame" begin
    df = DataFrame(x = ["only_value"])
    df_tab = tabulate(df, :x, out=:df)
    @test nrow(df_tab) == 1
    @test df_tab.freq[1] == 1
    @test df_tab.cum[1] == 100
end


@testset "Tabulate - reorder_cols=false" begin
    df = DataFrame(x = ["c", "a", "b", "a", "c", "c"])
    df_tab = tabulate(df, :x, reorder_cols=false, out=:df)
    # without reordering, original groupby order is preserved
    # NOTE(review): only the row count and total frequency are asserted here,
    # not the row order itself.
    @test nrow(df_tab) == 3
    @test sum(df_tab.freq) == 6
end


# An unknown `format_stat` in wide layout must raise.
@testset "Tabulate - invalid format_stat in wide" begin
    df = dropmissing(DataFrame(PalmerPenguins.load()))
    @test_throws Exception tabulate(df, [:island, :species],
        format_tbl=:wide, format_stat=:invalid, out=:df)
end