commit 2e6cd442cd49fdc3dffbb16a02408ae5197122e3
parent a2ca892c95b3a6fc63226a01c27e13d777256dcb
Author: Erik Loualiche <[email protected]>
Date: Tue, 20 May 2025 11:42:39 -0500
first commit with code
Diffstat:
12 files changed, 1258 insertions(+), 6 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1 +1,13 @@
+
+# ---------------------------------------------------------
+# STANDARD JULIA IGNORE
+/Manifest.toml
/Manifest*.toml
+docs/build/
+docs/node_modules
+.DS_Store
+docs/.DS_Store
+.env
+.env.gpg
+# ---------------------------------------------------------
+
diff --git a/Project.toml b/Project.toml
@@ -1,13 +1,35 @@
name = "BazerData"
uuid = "9777a11d-2328-4b97-9b51-b265bb408da6"
authors = ["Erik Loualiche"]
-version = "1.0.0-DEV"
+version = "0.7.1"
+
+[deps]
+ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
+Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
+Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
+PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
[compat]
+ColorSchemes = "3.29.0"
+Crayons = "4.1.1"
+DataFrames = "1.7.0"
+Dates = "1.11.0"
+Interpolations = "0.16.1"
+Missings = "1.2.0"
+PrettyTables = "2.4.0"
+Random = "1.11.0"
+StatsBase = "0.34.5"
julia = "1.6.7"
[extras]
+PalmerPenguins = "8b842266-38fa-440a-9b57-31493939ab85"
+StreamToString = "dc918f9c-79cc-42e6-85f1-d8b9b09632f4"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[targets]
-test = ["Test"]
+test = ["Test", "PalmerPenguins", "StreamToString"]
diff --git a/README.md b/README.md
@@ -1,3 +1,115 @@
# BazerData
-[](https://github.com/eloualiche/BazerData.jl/actions/workflows/CI.yml?query=branch%3Amain)
+[](https://github.com/eloualiche/BazerData.jl/actions/workflows/CI.yml)
+[](https://github.com/eloualiche/BazerData.jl/actions/workflows/CI.yml)
+[](https://codecov.io/gh/eloualiche/BazerData.jl)
+
+`BazerData.jl` is a placeholder package for some functions that I use frequently in Julia.
+
+So far the package provides four functions
+
+ 1. tabulate some data ([`tabulate`](#tabulate-data))
+ 2. create category based on quantile ([`xtile`](#xtile))
+ 3. winsorize some data ([`winsorize`](#winsorize-data))
+ 4. fill unbalanced panel data ([`panel_fill`](#filling-an-unbalanced-panel))
+
+Note that as the package grows in different directions, dependencies might become overwhelming.
+The readme serves as documentation; there might be more examples inside of the test folder.
+
+## Installation
+
+`BazerData.jl` is not yet a registered package.
+You can install it from github via
+```julia
+import Pkg
+Pkg.add(url="https://github.com/eloualiche/BazerData.jl")
+```
+
+
+## Usage
+
+### Tabulate data
+
+The `tabulate` function tries to emulate the tabulate function from stata (see oneway [here](https://www.stata.com/manuals/rtabulateoneway.pdf) or twoway [here](https://www.stata.com/manuals13/rtabulatetwoway.pdf)).
+This relies on the `DataFrames.jl` package and is useful to get a quick overview of the data.
+
+```julia
+using DataFrames
+using BazerData
+using PalmerPenguins
+
+df = DataFrame(PalmerPenguins.load())
+
+tabulate(df, :island)
+tabulate(df, [:island, :species])
+
+# If you are looking for groups by type (detect missing e.g.)
+df = DataFrame(x = [1, 2, 2, "NA", missing], y = ["c", "c", "b", "z", "d"])
+tabulate(df, [:x, :y], group_type = :type) # only types for all group variables
+tabulate(df, [:x, :y], group_type = [:value, :type]) # mix value and types
+```
+I have not implemented all the features of the stata tabulate function, but I am open to [suggestions](#3).
+
+
+### xtile
+
+See the [doc](https://eloualiche.github.io/BazerData.jl/dev/man/xtile_guide) or the [tests](test/UnitTests/xtile.jl) for examples.
+```julia
+sales = rand(10_000);
+a = xtile(sales, 10);
+b = xtile(sales, 10, weights=Weights(repeat([1], length(sales))) );
+# works on strings
+cities = [randstring() for _ in 1:10]
+xtile(cities, 10)
+```
+
+
+### Winsorize data
+
+This is fairly standard and I offer options to specify probabilities or cutpoints; moreover you can replace the values that are winsorized with a missing, the cutpoints, or some specific values.
+There is a [`winsor`](https://juliastats.org/StatsBase.jl/stable/robust/#StatsBase.winsor) function in StatsBase.jl but I think it's a little less full-featured.
+
+See the doc for [examples](https://eloualiche.github.io/BazerData.jl/dev/man/winsorize_guide)
+```julia
+df = DataFrame(PalmerPenguins.load())
+winsorize(df.flipper_length_mm, probs=(0.05, 0.95)) # skipmissing by default
+transform(df, :flipper_length_mm =>
+ (x->winsorize(x, probs=(0.05, 0.95), replace_value=missing)), renamecols=false)
+```
+
+
+### Filling an unbalanced panel
+
+Sometimes it is impractical to work with unbalanced panel data.
+There are many ways to fill values between dates (what interpolation to use) and I try to implement a few of them.
+I use the function sparingly, so it has not been tested extensively.
+
+See the following example (or the test suite) for more information.
+```julia
+df_panel = DataFrame( # missing t=2 for id=1
+ id = ["a","a", "b","b", "c","c","c", "d","d","d","d"],
+ t = [Date(1990, 1, 1), Date(1990, 4, 1), Date(1990, 8, 1), Date(1990, 9, 1),
+ Date(1990, 1, 1), Date(1990, 2, 1), Date(1990, 4, 1),
+ Date(1999, 11, 10), Date(1999, 12, 21), Date(2000, 2, 5), Date(2000, 4, 1)],
+ v1 = [1,1, 1,6, 6,0,0, 1,4,11,13],
+ v2 = [1,2,3,6,6,4,5, 1,2,3,4],
+ v3 = [1,5,4,6,6,15,12.25, 21,22.5,17.2,1])
+
+panel_fill(df_panel, :id, :t, [:v1, :v2, :v3],
+ gap=Month(1), method=:backwards, uniquecheck=true, flag=true, merge=true)
+panel_fill(df_panel, :id, :t, [:v1, :v2, :v3],
+ gap=Month(1), method=:forwards, uniquecheck=true, flag=true, merge=true)
+panel_fill(df_panel, :id, :t, [:v1, :v2, :v3],
+ gap=Month(1), method=:linear, uniquecheck=true, flag=true, merge=true)
+```
+
+
+
+
+## Other stuff
+
+
+See my other packages:
+ - [BazerData.jl](https://github.com/eloualiche/BazerData.jl) which groups together data wrangling functions.
+ - [FinanceRoutines.jl](https://github.com/eloualiche/FinanceRoutines.jl) which is more focused and centered on working with financial data.
+ - [TigerFetch.jl](https://github.com/eloualiche/TigerFetch.jl) which simplifies downloading shape files from the Census.
diff --git a/src/BazerData.jl b/src/BazerData.jl
@@ -1,5 +1,33 @@
module BazerData
-# Write your package code here.
+# --------------------------------------------------------------------------------------------------
+# Explicit, qualified imports: every external binding used by the package is listed
+# here, which keeps the provenance of each name obvious and the namespace clean.
+import ColorSchemes: get, colorschemes
+import Crayons: @crayon_str
+import DataFrames: AbstractDataFrame, ByRow, DataFrame, groupby, combine, nrow, Not, nonunique, proprow,
+    rename, rename!, select, select!, transform, transform!, unstack
+import Dates: format, now, DatePeriod, Dates, Dates.AbstractTime, ISODateTimeFormat
+import Interpolations: Linear, Constant, Previous, Next, BSpline, interpolate
+import Missings: disallowmissing
+import PrettyTables: Crayon, ft_printf, get, Highlighter, hl_col, pretty_table
+import Random: seed!
+import StatsBase: quantile, UnitWeights, Weights
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+# Import functions
+# PanelData.jl  -> panel_fill, panel_fill!
+# StataUtils.jl -> tabulate, xtile (and private helpers)
+# Winsorize.jl  -> winsorize
+include("PanelData.jl")
+include("StataUtils.jl")
+include("Winsorize.jl")
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+# List of exported functions
+export panel_fill, panel_fill!
+export tabulate
+export xtile
+export winsorize
+# --------------------------------------------------------------------------------------------------
end
diff --git a/src/PanelData.jl b/src/PanelData.jl
@@ -0,0 +1,175 @@
+# ------------------------------------------------------------------------------------------
+"""
+    panel_fill(
+        df::DataFrame,
+        id_var::Symbol,
+        time_var::Symbol,
+        value_var::Union{Symbol, Vector{Symbol}};
+        gap::Union{Int, DatePeriod} = 1,
+        method::Symbol = :backwards,
+        uniquecheck::Bool = true,
+        flag::Bool = false,
+        merge::Bool = false
+    )
+
+# Arguments
+- `df::AbstractDataFrame`: a panel dataset
+- `id_var::Symbol`: the individual index dimension of the panel
+- `time_var::Symbol`: the time index dimension of the panel (must be integer or a date)
+- `value_var::Union{Symbol, Vector{Symbol}}`: the set of columns we would like to fill
+
+# Keywords
+- `gap::Union{Int, DatePeriod} = 1` : the interval size for which we want to fill data
+- `method::Symbol = :backwards`: the interpolation method to fill the data
+    options are: `:backwards` (default), `:forwards`, `:linear`, `:nearest`
+    email me for other interpolations (anything from Interpolations.jl is possible)
+- `uniquecheck::Bool = true`: check if panel is clean
+- `flag::Bool = false`: flag the interpolated values
+- `merge::Bool = false`: merge the new values with the input dataset
+
+# Returns
+- `AbstractDataFrame`: the interpolated rows only, or with `merge=true` the original
+    and interpolated rows stacked and sorted by `(id_var, time_var)`
+
+# Examples
+- See tests
+
+NOTE: `df` is sorted in place and (when `gap` is a `DatePeriod` that rounds the dates)
+may gain a temporary `<time_var>_rounded` column.
+"""
+function panel_fill(
+    df::DataFrame,
+    id_var::Symbol, time_var::Symbol, value_var::Union{Symbol, Vector{Symbol}};
+    gap::Union{Int, DatePeriod} = 1,
+    method::Symbol = :backwards,
+    uniquecheck::Bool = true,
+    flag::Bool = false,
+    merge::Bool = false
+    )
+
+    # prepare the data (sorts the caller's DataFrame in place)
+    sort!(df, [id_var, time_var])
+    if isa(value_var, Symbol)
+        value_var = [value_var]
+    end
+    if uniquecheck # check for unicity of the (id, time) key
+        any(nonunique(df, [id_var, time_var])) &&
+            (@warn "Some non unique observations in dataset")
+    end
+
+    time_var_r = join([string(time_var), "rounded"], "_") # clean up if dates
+    if typeof(gap) <: DatePeriod
+        # BUG FIX: the original tested `eltype(df.t)`, hard-coding a column named `t`
+        # instead of the user-supplied `time_var` column.
+        if !(eltype(df[!, time_var]) <: Dates.AbstractTime)
+            error(
+                """
+                Type of gap $(typeof(gap)) and type of time variable $(eltype(df[!, time_var])) do not match
+                """
+            )
+        else
+            # round dates down to the gap so the fill grid lines up
+            df[!, time_var_r] .= floor.(df[!, time_var], gap)
+            if !(df[!, time_var_r] == df[!, time_var])
+                @warn "Using rounded time variables for consistency with gap: $gap"
+            end
+        end
+    else
+        df[!, time_var_r] .= df[!, time_var]
+    end
+
+    gdf = groupby(df, [id_var])
+    df_fill = DataFrame();
+
+    for id_gdf in eachindex(gdf)
+        subdf = gdf[id_gdf]
+        sub_fill = DataFrame()
+
+        # pick the interpolation scheme used inside each gap
+        if method == :backwards
+            interpolate_method = BSpline(Constant(Previous))
+        elseif method == :forwards
+            interpolate_method = BSpline(Constant(Next)) # Next-neighbor interpolation
+        elseif method == :nearest
+            interpolate_method = BSpline(Constant()) # Nearest-neighbor interpolation
+        elseif method == :linear
+            interpolate_method = BSpline(Linear())
+        else
+            error(
+                """
+                Method $method not available.
+                Please choose from :backwards, :forwards, :nearest, :linear (default)
+                """
+            )
+        end
+
+        if nrow(subdf) > 1 # condition for filling: at least one open interval
+            sort!(subdf, time_var_r)
+            rowdf_init = subdf[1, :]
+            for rowdf in eachrow(subdf)[2:end]
+
+                old_t = rowdf_init[time_var_r] # initialize the iteration
+                enum_t = rowdf[time_var_r]
+
+                # interior grid points strictly between the two observed dates
+                t_fill = collect(range(old_t, enum_t, step=sign(enum_t-old_t) * gap))[2:end-1]
+                group_fill = DataFrame(
+                    Dict(Symbol(time_var_r) => t_fill, id_var => id_gdf[1]))
+                N_fill = nrow(group_fill)
+                scale_xs = range(1, 2, N_fill+2)[2:end-1] # interpolation abscissae in (1, 2)
+
+                # this builds the interpolator (one per value column) on the two endpoints
+                interp_dict = Dict(
+                    v => interpolate([rowdf_init[v], rowdf[v]], interpolate_method)
+                    for v in value_var)
+                var_fill = DataFrame(
+                    Dict(v => interp_dict[v].(scale_xs) for v in value_var))
+
+                # process the iteration and move on
+                sub_fill = vcat(sub_fill, hcat(group_fill, var_fill))
+                rowdf_init = rowdf;
+            end
+        end
+        df_fill = vcat(sub_fill, df_fill)
+    end
+
+    # clean up the output
+    if flag
+        df_fill[!, :flag] .= method
+    end
+    if df[!, time_var_r] == df[!, time_var]
+        # rounding was a no-op: restore the original column name, drop the helper column
+        rename!(df_fill, time_var_r => time_var)
+        select!(df, Not(time_var_r))
+    else # if they are not all the same we are going to fill
+        transform!(df_fill, time_var_r => time_var)
+    end
+
+    if merge
+        if flag
+            df[!, :flag] .= :original
+        end
+        return sort(vcat(df, df_fill, cols=:union), [id_var, time_var])
+    else
+        return df_fill
+    end
+
+end
+
+
+"""
+ panel_fill!(...)
+
+ Same as panel_fill but with modification in place
+"""
+function panel_fill!(
+ df::DataFrame,
+ id_var::Symbol, time_var::Symbol, value_var::Union{Symbol, Vector{Symbol}};
+ gap::Union{Int, DatePeriod} = 1,
+ method::Symbol = :backwards,
+ uniquecheck::Bool = true,
+ flag::Bool = false
+ )
+
+ df_fill = panel_fill(df, id_var, time_var, value_var,
+ gap = gap, method = method, uniquecheck = uniquecheck, flag = flag)
+ append!(df, df_fill, cols=:union)
+ sort!(df, [id_var, time_var])
+
+ return nothing
+
+end
+
+
+
diff --git a/src/StataUtils.jl b/src/StataUtils.jl
@@ -0,0 +1,529 @@
+# ------------------------------------------------------------------------------------------
+
+# StataUtils.jl
+
+# Collection of functions that replicate some stata utilities
+# ------------------------------------------------------------------------------------------
+
+
+
+# ------------------------------------------------------------------------------------------
+# List of exported functions
+# tabulate # (tab alias)
+# xtile
+# ------------------------------------------------------------------------------------------
+
+
+# ------------------------------------------------------------------------------------------
+"""
+ tabulate(df::AbstractDataFrame, cols::Union{Symbol, Array{Symbol}};
+ reorder_cols=true, out::Symbol=:stdout)
+
+This was forked from TexTables.jl and was inspired by https://github.com/matthieugomez/statar
+
+# Arguments
+- `df::AbstractDataFrame`: Input DataFrame to analyze
+- `cols::Union{Symbol, Vector{Symbol}}`: Single column name or vector of column names to tabulate
+- `group_type::Union{Symbol, Vector{Symbol}}=:value`: Specifies how to group each column:
+ - `:value`: Group by the actual values in the column
+ - `:type`: Group by the type of values in the column
+ - `Vector{Symbol}`: Vector combining `:value` and `:type` for different columns
+- `reorder_cols::Bool=true` Whether to sort the output by sortable columns
+- `format_tbl::Symbol=:long` How to present the results long or wide (stata twoway)
+- `format_stat::Symbol=:freq` Which statistics to present for format :freq or :pct
+- `skip_stat::Union{Nothing, Symbol, Vector{Symbol}}=nothing` do not print out all statistics (only for string)
+- `out::Symbol=:stdout` Output format:
+ - `:stdout` Print formatted table to standard output (returns nothing)
+ - `:df` Return the result as a DataFrame
+ - `:string` Return the formatted table as a string
+
+
+# Returns
+- `Nothing` if `out=:stdout`
+- `DataFrame` if `out=:df`
+- `String` if `out=:string`
+
+# Output Format
+The resulting table contains the following columns:
+- Specified grouping columns (from `cols`)
+- `freq`: Frequency count
+- `pct`: Percentage of total
+- `cum`: Cumulative percentage
+
+# TO DO
+allow user to specify order of columns (reorder = false flag)
+
+# Examples
+See the README for more examples
+```julia
+# Simple frequency table for one column
+tabulate(df, :country)
+
+## Group by value type
+tabulate(df, :age, group_type=:type)
+
+# Multiple columns with mixed grouping
+tabulate(df, [:country, :age], group_type=[:value, :type])
+
+# Return as DataFrame instead of printing
+result_df = tabulate(df, :country, out=:df)
+```
+
+"""
+function tabulate(
+ df::AbstractDataFrame, cols::Union{Symbol, Vector{Symbol}};
+ group_type::Union{Symbol, Vector{Symbol}}=:value,
+ reorder_cols::Bool=true,
+ format_tbl::Symbol=:long,
+ format_stat::Symbol=:freq,
+ skip_stat::Union{Nothing, Symbol, Vector{Symbol}}=nothing,
+ out::Symbol=:stdout)
+
+ if typeof(cols) <: Symbol # check if it's an array or just a point
+ N_COLS = 1
+ else
+ N_COLS = size(cols,1)
+ # error("Only accepts one variable for now ...")
+ end
+
+ if !(format_tbl ∈ [:long, :wide])
+ if size(cols, 1) == 1
+ @warn "Converting format_tbl to :long"
+ format_tbl = :long
+ else
+ @error "Table format_tbl must be :long or :wide"
+ end
+ end
+
+ if isempty(df)
+ @warn "Input Dataframe is empty ..."
+ return nothing
+ end
+
+ # Count the number of observations by `columns`: this is the main calculation
+ group_type_error_msg = """
+ \ngroup_type input must specify either ':value' or ':type' for columns;
+ options are :value, :type, or a vector combining the two;
+ see help for more information
+ """
+ if group_type == :value
+ df_out = combine(groupby(df, cols), nrow => :freq, proprow =>:pct)
+ new_cols = cols
+ elseif group_type == :type
+ name_type_cols = Symbol.(cols, "_typeof")
+ df_out = transform(df, cols .=> ByRow(typeof) .=> name_type_cols) |>
+ (d -> combine(groupby(d, name_type_cols), nrow => :freq, proprow =>:pct))
+ new_cols = name_type_cols
+ # rename!(df_out, name_type_cols .=> cols)
+ elseif typeof(group_type) <: Vector{Symbol}
+ !all(s -> s in [:value, :type], group_type) && (@error group_type_error_msg)
+ (size(group_type, 1) != size(cols, 1)) &&
+ (@error "\ngroup_type and cols must be the same size; \nsee help for more information")
+ type_cols = cols[group_type .== :type]
+ name_type_cols = Symbol.(type_cols, "_typeof")
+ group_cols = [cols[group_type .== :value]; name_type_cols]
+ df_out = transform(df, type_cols .=> ByRow(typeof) .=> name_type_cols) |>
+ (d -> combine(groupby(d, group_cols), nrow => :freq, proprow =>:pct))
+ new_cols = group_cols
+ else
+ @error group_type_error_msg
+ end
+ # resort columns based on the original order
+ new_cols = sort(new_cols isa Symbol ? [new_cols] : new_cols,
+ by= x -> findfirst(==(replace(string(x), r"_typeof$" => "")), string.(cols)) )
+
+ if reorder_cols
+ cols_sortable = [ # check whether it makes sense to sort on the variables
+ name
+ for (name, col) in pairs(eachcol(select(df_out, new_cols)))
+ if eltype(col) |> t -> hasmethod(isless, Tuple{t,t})
+ ]
+ if size(cols_sortable, 1)>0
+ cols_sortable
+ sort!(df_out, cols_sortable) # order before we build cumulative
+ end
+ end
+ transform!(df_out, :pct => cumsum => :cum, :freq => ByRow(Int) => :freq)
+ # easier to do some of the transformations on the numbers directly than using formatters
+ transform!(df_out,
+ :pct => (x -> x .* 100),
+ :cum => (x -> Int.(round.(x .* 100, digits=0))), renamecols=false)
+
+
+
+
+# ----- prepare the table
+ if format_tbl == :long
+
+ transform!(df_out, :freq => (x->text_histogram(x, width=24)) => :freq_hist)
+
+ # highlighter with gradient for the freq/pct/cum columns (rest is blue)
+ col_highlighters = vcat(
+ map(i -> (hl_col(i, crayon"cyan bold")), 1:N_COLS),
+ hl_custom_gradient(cols=(N_COLS+1), colorscheme=:Oranges_9, scale=maximum(df_out.freq)),
+ hl_custom_gradient(cols=(N_COLS+2), colorscheme=:Greens_9, scale=ceil(Int, maximum(df_out.pct))),
+ hl_custom_gradient(cols=(N_COLS+3), colorscheme=:Greens_9, scale=100),
+ )
+ col_highlighters = Tuple(x for x in col_highlighters)
+
+ col_formatters = Tuple(vcat(
+ [ ft_printf("%s", i) for i in 1:N_COLS ], # Column values
+ [
+ ft_printf("%d", N_COLS+1), # Frequency (integer)
+ ft_printf("%.1f", N_COLS+2),
+ ft_printf("%d", N_COLS+3), # Cumulative
+ ft_printf("%s", N_COLS+4) # Histogram
+ ]
+ ))
+
+ if out ∈ [:stdout, :df]
+
+ pretty_table(df_out;
+ hlines = [1],
+ vlines = [N_COLS],
+ alignment = vcat(repeat([:l], N_COLS), :c, :c, :c, :c),
+ cell_alignment = reduce(push!,
+ map(i -> (i,1)=>:l, 1:N_COLS+3),
+ init=Dict{Tuple{Int64, Int64}, Symbol}()),
+ header = [string.(new_cols); "Freq."; "Percent"; "Cum"; "Hist."],
+ formatters = col_formatters,
+ highlighters = col_highlighters,
+ vcrop_mode = :middle,
+ border_crayon = crayon"bold yellow",
+ header_crayon = crayon"bold light_green",
+ show_header = true,
+ )
+
+ if out==:stdout
+ return(nothing)
+ elseif out==:df
+ return(df_out)
+ end
+
+ elseif out==:string # this might be costly as I am regenerating the table.
+ if isnothing(skip_stat)
+ pt = pretty_table(String, df_out;
+ hlines = [1],
+ vlines = [N_COLS],
+ alignment = vcat(repeat([:l], N_COLS), :c, :c, :c, :c),
+ cell_alignment = reduce(push!,
+ map(i -> (i,1)=>:l, 1:N_COLS+3),
+ init=Dict{Tuple{Int64, Int64}, Symbol}()),
+ header = [string.(new_cols); "Freq."; "Percent"; "Cum"; "Hist."],
+ formatters = col_formatters,
+ highlighters = col_highlighters,
+ crop = :none, # no crop for string output
+ border_crayon = crayon"bold yellow",
+ header_crayon = crayon"bold light_green",
+ show_header = true,
+ )
+ else
+ col_stat = setdiff([:freq, :pct, :cum, :freq_hist],
+ isa(skip_stat, Vector) ? skip_stat : [skip_stat])
+ N_COL_STAT = size(col_stat,1)
+ header_table = vcat(string.(new_cols),
+ [Dict(:freq=>"Freq.", :pct=>"Percent", :cum=>"Cum", :freq_hist=>"Hist.")[k]
+ for k in col_stat]
+ )
+ df_sub_out = select(df_out, cols, col_stat)
+ pt = pretty_table(String, df_sub_out;
+ hlines = [1],
+ vlines = [N_COLS],
+ alignment = vcat(repeat([:l], N_COLS), repeat([:c], N_COL_STAT)),
+ cell_alignment = reduce(push!,
+ map(i -> (i,1)=>:l, 1:N_COLS+N_COL_STAT-1),
+ init=Dict{Tuple{Int64, Int64}, Symbol}()),
+ header = header_table,
+ formatters = col_formatters,
+ highlighters = col_highlighters,
+ crop = :none, # no crop for string output
+ border_crayon = crayon"bold yellow",
+ header_crayon = crayon"bold light_green",
+ show_header = true,
+ )
+ end
+
+ return(pt)
+ end
+
+ elseif format_tbl == :wide
+
+ df_out = unstack(df_out,
+ new_cols[1:(N_COLS-1)], new_cols[N_COLS], format_stat,
+ allowmissing=true)
+ # new_cols[1:(N_COLS-1)] might be more than one category
+ # new_cols[N_COLS] only one group!
+
+ N_GROUP_COLS = N_COLS - 1 # the first set of category (on the left!)
+ N_VAR_COLS = size(df_out, 2) - N_GROUP_COLS
+
+
+ if format_stat == :freq
+
+ # frequency we also show totals
+ total_row_des = "Total by $(string(new_cols[N_COLS]))"
+ total_col_des = join(vcat("Total by ", join(string.(new_cols[1:(N_COLS-1)]), ", ")))
+
+ sum_cols = sum.(skipmissing.(eachcol(df_out[:, range(1+N_GROUP_COLS; length=N_VAR_COLS)])))
+ row_vector = vcat([total_row_des], repeat(["-"], max(0, N_GROUP_COLS-1)), sum_cols)
+ df_out = vcat(df_out,
+ DataFrame(permutedims(row_vector)[:, end+1-size(df_out,2):end], names(df_out))
+ )
+ sum_rows = sum.(skipmissing.(eachrow(df_out[:, range(1+N_GROUP_COLS; length=N_VAR_COLS)])))
+ col_vector = rename(DataFrame(total = sum_rows), "total" => total_col_des)
+ df_out = hcat(df_out, col_vector)
+ rename!(df_out, [i => "-"^i for i in 1:N_GROUP_COLS])
+
+ #TODO: add a line on top
+ # blank for the group_cols
+ # name of the wide col
+ # total by for the sum col
+
+ col_highlighters = vcat(
+ map(i -> (hl_col(i, crayon"cyan bold")), 1:N_GROUP_COLS),
+ [ hl_custom_gradient(cols=i, colorscheme=:Greens_9,
+ scale = ceil(Int, maximum(skipmissing(df_out[1:end-1, i]))))
+ for i in range(1+N_GROUP_COLS; length=N_VAR_COLS) ],
+ hl_col(size(df_out, 2), crayon"green")
+ )
+
+ formatters = vcat(
+ [ ft_printf("%s", i) for i in 1:N_GROUP_COLS ],
+ [ ft_printf("%d", j) for j in range(1+N_GROUP_COLS; length=N_VAR_COLS) ],
+ [ ft_printf("%d", 1+N_GROUP_COLS+N_VAR_COLS) ]
+ )
+
+ hlines = [1, size(df_out, 1)]
+ vlines = [N_GROUP_COLS, N_GROUP_COLS+N_VAR_COLS]
+ alignment = vcat(repeat([:l], N_GROUP_COLS), repeat([:c], N_VAR_COLS), [:l])
+
+
+ elseif format_stat == :pct
+
+ col_highlighters = vcat(
+ map(i -> (hl_col(i, crayon"cyan bold")), 1:N_GROUP_COLS),
+ [ hl_custom_gradient(cols=i, colorscheme=:Greens_9,
+ scale = ceil(Int, maximum(skipmissing(df_out[:, i]))) )
+ for i in range(1+N_GROUP_COLS; length=N_VAR_COLS) ],
+ )
+
+ formatters = vcat(
+ [ ft_printf("%s", i) for i in 1:N_GROUP_COLS ],
+ [ ft_printf("%.1f", j) for j in range(1+N_GROUP_COLS; length=N_VAR_COLS) ]
+ )
+
+ hlines = [1]
+ vlines = [0, N_GROUP_COLS, N_GROUP_COLS+N_VAR_COLS]
+ alignment = vcat(repeat([:l], N_GROUP_COLS), repeat([:c], N_VAR_COLS))
+
+
+ end
+
+ col_highlighters = Tuple(x for x in col_highlighters)
+
+ if out ∈ [:stdout, :df]
+
+ pretty_table(df_out;
+ hlines = hlines,
+ vlines = vlines,
+ alignment = alignment,
+ cell_alignment = reduce(push!,
+ map(i -> (i,1)=>:l, 1:N_GROUP_COLS),
+ init=Dict{Tuple{Int64, Int64}, Symbol}()),
+ formatters = Tuple(formatters),
+ highlighters = col_highlighters,
+ vcrop_mode = :middle,
+ border_crayon = crayon"bold yellow",
+ header_crayon = crayon"bold light_green",
+ show_header = true,
+ show_subheader=false,
+ )
+
+ if out==:stdout
+ return(nothing)
+ elseif out==:df
+ return(df_out)
+ end
+ elseif out==:string
+ pt = pretty_table(String, df_out;
+ hlines = hlines,
+ vlines = vlines,
+ alignment = alignment,
+ cell_alignment = reduce(push!,
+ map(i -> (i,1)=>:l, 1:N_GROUP_COLS),
+ init=Dict{Tuple{Int64, Int64}, Symbol}()),
+ formatters = Tuple(formatters),
+ highlighters = col_highlighters,
+ crop = :none, # no crop for string output
+ border_crayon = crayon"bold yellow",
+ header_crayon = crayon"bold light_green",
+ show_header = true,
+ show_subheader = false,
+ )
+
+ return(pt)
+ end
+ end
+
+
+end
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+# Build a PrettyTables highlighter that shades column `cols` with a colour
+# gradient from `colorscheme`, mapping cell values onto the range (0, scale).
+# Missing cells are rendered in a neutral grey.
+function hl_custom_gradient(;
+    cols::Int=0,
+    colorscheme::Symbol=:Oranges_9,
+    scale::Int=1)
+
+    # predicate: only the requested column is highlighted
+    matches_col(data, i, j) = j == cols
+
+    # map the cell value onto the colour scheme and convert to a terminal crayon
+    function cell_crayon(h, data, i, j)
+        ismissing(data[i, j]) && return Crayon(foreground=(128, 128, 128)) # default grey for missing
+        shade = get(colorschemes[colorscheme], data[i, j], (0, scale))
+        to_byte(channel) = round(Int, channel * 255)
+        return Crayon(foreground=(to_byte(shade.r), to_byte(shade.g), to_byte(shade.b)))
+    end
+
+    return Highlighter(matches_col, cell_crayon)
+end
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+# From https://github.com/mbauman/Sparklines.jl/blob/master/src/Sparklines.jl
+# Sparklines.jl
+# const ticks = ['▁','▂','▃','▄','▅','▆','▇','█']
+# function spark(x)
+# min, max = extrema(x)
+# f = div((max - min) * 2^8, length(ticks)-1)
+# f < 1 && (f = one(typeof(f)))
+# idxs = convert(Vector{Int}, map(v -> div(v, f), (x .- min) * 2^8))
+# return string.(ticks[idxs.+1])
+# end
+
+# Unicode characters:
+# █ (Full block, U+2588)
+# ⣿ (Full Braille block, U+28FF)
+# ▓ (Dark shade, U+2593)
+# ▒ (Medium shade, U+2592)
+# ░ (Light shade, U+2591)
+# ◼ (Small black square, U+25FC)
+
+# Render a vector of counts as fixed-width unicode bars (one string per count),
+# using eighth-block glyphs for sub-character resolution.
+function text_histogram(frequencies; width=12)
+    # glyph at index k+1 draws k/8 of a character cell
+    eighths = [" ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█"]
+    peak = maximum(frequencies)
+    peak == 0 && return fill(" " ^ width, length(frequencies))
+    # one unit = 1/8 character; the -1 keeps the longest bar strictly inside `width`
+    units_per_count = (width * 8 - 1) / peak
+
+    function render(f)
+        units = round(Int, f * units_per_count)
+        whole, part = divrem(units, 8)
+        return rpad(repeat("█", whole) * eighths[part + 1], width)
+    end
+    return render.(frequencies)
+end
+# --------------------------------------------------------------------------------------------------
+
+
+
+# --------------------------------------------------------------------------------------------------
+
+"""
+ xtile(data::Vector{T}, n_quantiles::Integer,
+ weights::Union{Vector{Float64}, Nothing}=nothing)::Vector{Int} where T <: Real
+
+Create quantile groups using Julia's built-in weighted quantile functionality.
+
+# Arguments
+- `data`: Values to group
+- `n_quantiles`: Number of groups
+- `weights`: Optional weights of weight type (StatasBase)
+
+# Examples
+```julia
+sales = rand(10_000);
+a = xtile(sales, 10);
+b = xtile(sales, 10, weights=Weights(repeat([1], length(sales))) );
+@assert a == b
+```
+"""
+function xtile(
+ data::AbstractVector{T},
+ n_quantiles::Integer;
+ weights::Union{Weights{<:Real}, Nothing} = nothing
+)::Vector{Int} where T <: Real
+
+ N = length(data)
+ n_quantiles > N && (@warn "More quantiles than data")
+
+ probs = range(0, 1, length=n_quantiles + 1)[2:end]
+ if weights === nothing
+ weights = UnitWeights{T}(N)
+ end
+ cuts = quantile(collect(data), weights, probs)
+
+ return searchsortedlast.(Ref(cuts), data)
+end
+
+# String version: groups string categories by their (weighted) frequency and
+# assigns every observation the group of its category.
+function xtile(
+    data::AbstractVector{T},
+    n_quantiles::Integer;
+    weights::Union{Weights{<:Real}, Nothing} = nothing
+)::Vector{Int} where T <: AbstractString
+
+    # unweighted by default: every observation counts once
+    if weights === nothing
+        weights = UnitWeights{Int}(length(data))
+    end
+    # Assign weights to each category
+    category_weights = [sum(weights[data .== category]) for category in unique(data)]
+    # Sort categories based on the weighted cumulative sum
+    sorted_categories = sortperm(category_weights, rev=true)
+    # keep every `step`-th category as a cut marker so roughly n_quantiles remain
+    step = max(1, round(Int, length(sorted_categories) / n_quantiles))
+    cuts = unique(data)[sorted_categories][1:step:end]
+
+    # NOTE(review): `searchsortedlast` assumes `cuts` is sorted, but `cuts` here is
+    # ordered by descending category weight, not lexicographically — confirm this
+    # ordering is intentional before relying on the group numbering.
+    return searchsortedlast.(Ref(cuts), data)
+
+end
+
+# Missing-aware wrapper: computes quantile groups on the non-missing subset and
+# re-inserts `missing` at the original positions.
+function xtile(
+    data::AbstractVector{T},
+    n_quantiles::Integer;
+    weights::Union{Weights{<:Real}, Nothing} = nothing
+)::Vector{Union{Int, Missing}} where {T <: Union{Missing, AbstractString, Number}}
+
+    # underlying element type once `Missing` is stripped from the union
+    base_type = Base.nonmissingtype(T)
+
+    # positions of the observations we can actually rank
+    keep_idx = findall(!ismissing, data)
+
+    # degenerate case: nothing to rank, everything stays missing
+    isempty(keep_idx) && return fill(missing, length(data))
+
+    # views avoid copies; the convert pins down the concrete non-missing eltype
+    observed = convert(Vector{base_type}, @view data[keep_idx])
+    observed_weights = weights === nothing ? nothing : Weights(@view weights[keep_idx])
+
+    # delegate to the non-missing method
+    groups = xtile(observed, n_quantiles; weights=observed_weights)
+
+    # scatter the computed groups back into a missing-capable result vector
+    out = Vector{Union{Int, Missing}}(missing, length(data))
+    out[keep_idx] .= groups
+    return out
+end
+# --------------------------------------------------------------------------------------------------
+
+
+
+
+
diff --git a/src/Winsorize.jl b/src/Winsorize.jl
@@ -0,0 +1,72 @@
+# ------------------------------------------------------------------------------------------
+"""
+    winsorize(
+        x::AbstractVector;
+        probs::Union{Tuple{Real, Real}, Nothing} = nothing,
+        cutpoints::Union{Tuple{Real, Real}, Nothing} = nothing,
+        replace_value = nothing,
+        IQR::Real = 3,
+        verbose::Bool = false
+    )
+
+# Arguments
+- `x::AbstractVector`: a vector of values
+
+# Keywords
+- `probs::Union{Tuple{Real, Real}, Nothing}`: A pair of probabilities that can be used instead of cutpoints
+- `cutpoints::Union{Tuple{Real, Real}, Nothing}`: Cutpoints under and above which are defined outliers. Default is (median - IQR times interquartile range, median + IQR times interquartile range). Compared to bottom and top percentile, this takes into account the whole distribution of the vector
+- `replace_value`: Values by which outliers are replaced. Default to cutpoints. Pass `missing` to replace outliers with `missing` instead.
+- `IQR::Real`: when inferring cutpoints what is the multiplier from the median for the interquartile range. (median ± IQR * (q75-q25))
+- `verbose::Bool`: printing level
+
+# Returns
+- `AbstractVector`: A vector the size of x with substituted values
+
+# Examples
+- See tests
+
+This code is based on Matthieu Gomez winsorize function in the `statar` R package
+"""
+function winsorize(x::AbstractVector{T};
+    probs::Union{Tuple{Real, Real}, Nothing} = nothing,
+    cutpoints::Union{Tuple{Union{T, Real}, Union{T, Real}}, Nothing} = nothing,
+    replace_value::Union{Tuple{Union{T, Real}, Union{T, Real}}, Tuple{Missing, Missing}, Nothing, Missing} = nothing,
+    IQR::Real=3,
+    verbose::Bool=false
+    ) where T
+
+    if !isnothing(probs)
+        # BUG FIX: validate the RAW probabilities before clamping — the original
+        # checked after max(.., 0)/min(.., 1), so the check could never fire.
+        (minimum(probs) < 0 || maximum(probs) > 1) && @error "bad probability input"
+        lower_percentile = max(minimum(probs), 0)
+        upper_percentile = min(maximum(probs), 1)
+        verbose && any(ismissing, x) && (@info "Some missing data skipped in winsorizing")
+        verbose && !isnothing(cutpoints) && (@info "input cutpoints ignored ... using probabilities")
+
+        # 0 and 1 degenerate to the extrema (avoids a quantile call at the edges)
+        cut_lo = (lower_percentile==0) ? minimum(skipmissing(x)) : quantile(skipmissing(x), lower_percentile)
+        cut_hi = (upper_percentile==1) ? maximum(skipmissing(x)) : quantile(skipmissing(x), upper_percentile)
+        cutpoints = (cut_lo, cut_hi)
+
+    elseif isnothing(cutpoints)
+        # neither probs nor cutpoints given: infer robust cutpoints around the median
+        verbose && any(ismissing, x) && (@info "Some missing data skipped in winsorizing")
+        l = quantile(skipmissing(x), [0.25, 0.50, 0.75])
+        cutpoints = (l[2] - IQR * (l[3]-l[1]), l[2] + IQR * (l[3]-l[1]) )
+        verbose && @info "Inferred cutpoints are ... $cutpoints (using interquartile range x $IQR from median)"
+    end
+
+    if isnothing(replace_value) # default to cutpoints
+        replace_value = (minimum(cutpoints), maximum(cutpoints))
+        replace_value = convert.(Union{T, eltype(replace_value)}, replace_value)
+    elseif ismissing(replace_value)
+        replace_value = (missing, missing)
+    end
+
+    if any(ismissing.(replace_value))
+        y = Vector{Union{T, Missing}}(x) # Make a copy of x that can also store missing values
+    else
+        y = Vector{Union{T, eltype(replace_value)}}(x) # TODO could be faster using views here ...
+    end
+
+    # findall over skipmissing yields parent indices, so missing entries are untouched
+    y[findall(skipmissing(x .< cutpoints[1]))] .= replace_value[1];
+    y[findall(skipmissing(x .> cutpoints[2]))] .= replace_value[2];
+
+    return y
+end
diff --git a/test/UnitTests/panel_fill.jl b/test/UnitTests/panel_fill.jl
@@ -0,0 +1,95 @@
+@testset "panel_fill" begin
+
+# include("./src/PanelData.jl")
+
+ df1 = DataFrame( # missing t=2,3 for id=1
+ id = [1,1,2,2,2],
+ t = [1,4,1,2,4],
+ a = [1,1,1,0,0])
+
+ df2 = DataFrame( # panel with gaps (e.g. t=2,3 missing for id="a")
+ id = ["a","a","b","b","c","c","c"],
+ t = [1,4,8,9,1,2,4],
+ v1 = [1,1,1,6,6,0,0],
+ v2 = [1,2,3,6,6,4,5],
+ v3 = [1,5,4,6,6,15,12.25])
+
+ df3 = DataFrame( # monthly date panel with gaps (e.g. Feb-Mar 1990 missing for id="a")
+ id = ["a","a", "b","b", "c","c","c", "d","d","d","d"],
+ t = [Date(1990, 1, 1), Date(1990, 4, 1), Date(1990, 8, 1), Date(1990, 9, 1),
+ Date(1990, 1, 1), Date(1990, 2, 1), Date(1990, 4, 1),
+ Date(1999, 11, 10), Date(1999, 12, 21), Date(2000, 2, 5), Date(2000, 4, 1)],
+ v1 = [1,1, 1,6, 6,0,0, 1,4,11,13],
+ v2 = [1,2,3,6,6,4,5, 1,2,3,4],
+ v3 = [1,5,4,6,6,15,12.25, 21,22.5,17.2,1])
+
+ # --- test for df1
+ @testset "DF1" begin
+ df1_test = panel_fill(df1, :id, :t, :a,
+ gap=1, method=:backwards, uniquecheck=true, flag=true)
+ @test isequal(select(df1_test, :a),
+ DataFrame(a = [0.0, 1.0, 1.0]))
+ # TODO clean up this test
+ df1_test = panel_fill(df1, :id, :t, :a,
+ gap=1, method=:backwards, uniquecheck=true, flag=true, merge=true)
+ @test isequal(nrow(df1_test), 8)
+ end
+
+ # --- test for df2 multiple variables
+ @testset "DF2" begin
+ df2_test = panel_fill(df2, :id, :t, [:v1, :v2, :v3],
+ gap=1, method=:backwards, uniquecheck=true, flag=true)
+ @test isequal(select(df2_test, r"v"),
+ DataFrame(v1 = [0.0, 1.0, 1.0], v2 = [4.0, 1.0, 1.], v3 = [15.0, 1.0, 1.0]))
+
+ df2_test = panel_fill(df2, :id, :t, :v1,
+ gap=1, method=:backwards, uniquecheck=true, flag=true, merge=true)
+ @test isequal((nrow(df2_test), nrow(filter(:v2 => !ismissing, df2_test))),
+ (10, 7))
+ end
+
+
+ # --- test for df3 multiple variables and dates
+ @testset "DF3" begin
+ # test with dates backwards
+ df3_test = panel_fill(df3, :id, :t, [:v1, :v2, :v3],
+ gap=Month(1), method=:backwards, uniquecheck=true, flag=true)
+ @test isequal(select(df3_test, :v1, :v2, :v3),
+ DataFrame(v1 = [4.0, 11.0, 0.0, 1.0, 1.0], v2 = [2.0, 3.0, 4.0, 1.0, 1.0],
+ v3 = [22.5, 17.2, 15.0, 1.0, 1.0]))
+
+ # test in place with dates forwards and only fill some variables and not others
+ df3_test = copy(df3)
+ panel_fill!(df3_test, :id, :t, [:v2],
+ gap=Month(1), method=:forwards, uniquecheck=true, flag=true)
+ @test isequal(
+ select(subset(df3_test, :flag => ByRow(==(:forwards)), skipmissing=true), :v1, :v2),
+ DataFrame(v1 = repeat([missing], inner=5), v2 = [2.0, 2.0, 5.0, 3.0, 4.0]))
+
+ # linear interpolation
+ df3_test = panel_fill(df3, :id, :t, [:v1, :v2, :v3],
+ gap=Month(1), method=:linear, uniquecheck=true, flag=true, merge=false)
+ @test isapprox(select(df3_test, r"v"),
+ DataFrame(v1 = [7.5 , 12.0, 0.0, 1.0, 1.0], v2 = [2.5, 3.5, 4.5, 1.333, 1.666],
+ v3 = [19.85, 9.1, 13.625, 2.3333, 3.666]),
+ atol = 0.01)
+
+ # nearest
+ df3_test = panel_fill(df3, :id, :t, :v1,
+ gap=Month(1), method=:nearest, uniquecheck=true, flag=true, merge=false)
+ @test isequal(select(df3_test, :v1), DataFrame(v1 = [11.0, 13.0, 0.0, 1.0, 1.0]))
+
+ # TODO clean up these tests
+
+ # -- different time periods
+ # this fails
+ # panel_fill(df3, :id, :t, [:v1, :v2, :v3],
+ # gap=Month(2), method=:backwards, uniquecheck=true, flag=true, merge=true)
+ df3_test = panel_fill(df3, :id, :t, [:v1, :v2, :v3],
+ gap=Day(10), method=:forwards, uniquecheck=true, flag=true, merge=true)
+ @test isequal(nrow(df3_test) , 39)
+
+ end
+
+
+end
diff --git a/test/UnitTests/tabulate.jl b/test/UnitTests/tabulate.jl
@@ -0,0 +1,71 @@
+@testset "Tabulate" begin
+
+ # on existing dataset
+ df = dropmissing(DataFrame(PalmerPenguins.load()))
+ cols = :island
+
+ # Test that function do not error on empty
+ @test isnothing(tabulate(df[ df.island .== "Brehat", :], :sex))
+
+ col_length = combine(groupby(df, cols), cols .=> length => :_N)
+ sort!(col_length, cols)
+ col_tab = tabulate(df, :island; out=:df);
+ sort!(col_tab, cols)
+ @test col_length._N == col_tab.freq
+
+ # test the string output
+ tab_buf = IOBuffer(tabulate(df, :island; out=:string))
+ tab_string = String(take!(tab_buf))
+ @test count(==('\n'), tab_string) == 5 # test number of lines expected
+ first_line = split(tab_string, '\n', limit=2)[1]
+ @test all(x -> contains(first_line, x), ["island", "Freq", "Percent", "Cum", "Hist."])
+
+ tab_buf = IOBuffer(tabulate(df, :island; out=:string, skip_stat=:freq_hist))
+ tab_string = String(take!(tab_buf))
+ @test count(==('\n'), tab_string) == 5 # test number of lines expected
+ first_line = split(tab_string, '\n', limit=2)[1]
+ @test all(x -> contains(first_line, x), ["island", "Freq", "Percent", "Cum"])
+
+ # test the nothing output
+ tab_stdout = tabulate(df, :island, out=:stdout)
+ @test typeof(tab_stdout) == Nothing
+ tab_stdout = stdout_string() do # had to request a convenient package for this one...
+ tabulate(df, :island, out=:stdout)
+ end
+ @test count(==('\n'), tab_stdout) == 5 # test number of lines expected
+ first_line = split(tab_stdout, '\n', limit=2)[1]
+ @test all(x -> contains(first_line, x), ["island", "Freq", "Percent", "Cum", "Hist."])
+
+ # test the type columns get properly passed
+ @test contains(tabulate(df, [:island, :species], group_type = [:type, :value], out=:string),
+ "island_typeof")
+ @test contains(tabulate(df, [:island, :species], group_type = [:value, :type], out=:string),
+ "species_typeof")
+
+ # test the twoway and wide tabulate
+ df_twoway = tabulate(df, [:island, :species], format_tbl=:wide, out=:df);
+ @test names(df_twoway) == ["-", "Adelie", "Gentoo", "Chinstrap", "Total by island"]
+ @test nrow(df_twoway) == 4
+ df_twoway = tabulate(df, [:sex, :island, :species], format_tbl=:wide, out=:df);
+ @test names(df_twoway) == ["-", "--", "Adelie", "Gentoo", "Chinstrap", "Total by sex, island"]
+ @test nrow(df_twoway) == 7
+
+ # on a specific dataset (see issue #1)
+ df = DataFrame(x = [1, 2, 5, "NA", missing], y = ["a", "c", "b", "e", "d"])
+ df_tab = tabulate(df, :x, reorder_cols=true, out=:df)
+ @test isequal(df_tab.x, df.x)
+
+ # test the group type options
+ df = DataFrame(x = [1, 2, 2, "NA", missing], y = ["c", "c", "b", "z", "d"])
+ @test isequal(
+ tabulate(df, [:x, :y], out=:df).y,
+ sort(df.y))
+ @test nrow(tabulate(df, [:x, :y], group_type = :value, out=:df)) == 5
+ @test nrow(tabulate(df, [:x, :y], group_type = :type, out=:df)) == 3
+ @test nrow(tabulate(df, [:x, :y], group_type = [:type, :value], out=:df)) == 4
+ @test nrow(tabulate(df, [:x, :y], group_type = [:value, :type], out=:df)) == 4
+
+end
+
+
+# -- TODO: Add tests for results that include missing +
\ No newline at end of file
diff --git a/test/UnitTests/winsorize.jl b/test/UnitTests/winsorize.jl
@@ -0,0 +1,54 @@
+@testset "winsorize" begin
+
+ Random.seed!(3);
+ x1 = rand(100);
+ x2 = Vector{Union{Float64, Missing}}(rand(Float64, 100)); x2[rand(collect(1:100), 5)] .= missing;
+
+# --- tests on non-missing vectors
+ x1_win = winsorize(x1, probs=(0.05, 0.95), verbose=true);
+ @test findall(x1 .!= x1_win) == [4, 15, 26, 32, 40, 44, 52, 59, 64, 97]
+
+ x1_win = winsorize(x1; verbose=true);
+ @test findall(x1 .!= x1_win) == []
+
+ x1_win = winsorize(x1; cutpoints=(0.01, 0.99), verbose=true)
+ @test findall(x1 .!= x1_win) == [4, 26, 52]
+
+ x1_win = winsorize(x1; cutpoints=(0, 0.9), verbose=true)
+ @test isequal(minimum(x1), minimum(x1_win))
+
+# --- tests with some missing
+ x2_win = winsorize(x2, probs=(0.02, 0.98), verbose=true);
+ @test size(x2) == size(x2_win)
+ @test findall(skipmissing(x2 .!= x2_win)) == [5, 41, 83, 91]
+
+ x2_win = winsorize(x2; verbose=true)
+ @test size(x2) == size(x2_win)
+ @test findall(skipmissing(x2 .!= x2_win)) == []
+
+ x2_win = winsorize(x2; cutpoints=(0.05, 0.95), verbose=true)
+ @test size(x2) == size(x2_win)
+ @test findall(skipmissing(x2 .!= x2_win)) == [5, 17, 41, 42, 65, 83, 91]
+
+# --- tests to do: with replace
+ x2_win = winsorize(x2; cutpoints=(0.05, 0.95), replace_value=(missing, missing), verbose=true)
+ @test size(x2) == size(x2_win)
+ @test findall(ismissing.(x2) .!= ismissing.(x2_win)) == [5, 17, 41, 42, 65, 83, 91]
+
+ x2_win = winsorize(x2; cutpoints=(0.05, 0.95), replace_value=missing, verbose=true)
+ @test size(x2) == size(x2_win)
+ @test findall(ismissing.(x2) .!= ismissing.(x2_win)) == [5, 17, 41, 42, 65, 83, 91]
+
+ x2_win = winsorize(x2; cutpoints=(0.05, 0.95), replace_value=(-1.0, 1.0), verbose=true)
+ @test size(x2) == size(x2_win)
+ @test findall(v -> v ∈ (-1.0, 1.0), skipmissing(x2_win)) == [5, 17, 41, 42, 65, 83, 91]
+
+ # we check that this works if the type of replace is slightly different ...
+ # maybe we want to change this ...
+ x2_win = winsorize(x2; cutpoints=(0.05, 0.95), replace_value=(-1, 1), verbose=true)
+ @test size(x2) == size(x2_win)
+ @test findall(v -> v ∈ (-1.0, 1.0), skipmissing(x2_win)) == [5, 17, 41, 42, 65, 83, 91]
+
+
+
+end
diff --git a/test/UnitTests/xtile.jl b/test/UnitTests/xtile.jl
@@ -0,0 +1,55 @@
+@testset "xtile" begin
+
+ df = dropmissing(DataFrame(PalmerPenguins.load()))
+
+ # -- test on strings!
+ a = xtile(df.species, 2);
+ b = xtile(df.species, 2; weights=Weights(repeat([1], inner=nrow(df))));
+ @test a==b
+ @test sum(a)==520
+
+ # -- test for more xtile than categories
+ a = xtile(df.species, 4);
+ b = xtile(df.species, 5);
+ @test a==b
+
+ # -- test on int
+ a = xtile(df.flipper_length_mm, 2);
+ @test sum(a)==173
+ b = xtile(df.flipper_length_mm, 10);
+ @test sum(b)==1539
+ c = xtile(df.flipper_length_mm, 100);
+ @test sum(c)==16923
+ d = xtile(df.flipper_length_mm, 10, weights=Weights(repeat([1], inner=nrow(df))));
+ @test d==b
+ e = xtile(df.flipper_length_mm, 10, weights=Weights(rand(nrow(df))));
+ @test sum(e.<=10)==nrow(df)
+
+ # -- test on Float
+ a = xtile(df.bill_depth_mm, 2);
+ @test sum(a)==173
+ b = xtile(df.bill_depth_mm, 10);
+ @test sum(b)==1533
+ c = xtile(df.bill_depth_mm, 100);
+ @test sum(c)==16741
+ d = xtile(df.bill_depth_mm, 10, weights=Weights(repeat([1], inner=nrow(df))));
+ @test d==b
+ e = xtile(df.bill_depth_mm, 10, weights=Weights(rand(nrow(df))));
+ @test sum(e.<=10)==nrow(df)
+
+ # -- test on Union{Missing, Float64}
+ x_m = Vector{Union{Int64,Missing}}(collect(range(1, 1_000_000)));
+ x_m[sample(1:length(x_m), 10_000, replace=false)] .= convert(Missing, missing);
+ q_m = xtile(x_m, 10);
+ # test that function works ok
+ @test sum( ismissing.(q_m) ) == 10_000
+ # test that it gives the same result as the skipmissing result on subset of not missing
+ @test q_m[ .!ismissing.(q_m) ] == xtile(collect(skipmissing(x_m)), 10)
+
+ # -- test on Union{Missing, AbstractString}
+ s_m = ["a", "c", "g", missing, "e", missing, "za"]
+ @test isequal(xtile(s_m, 3), [1, 1, 2, missing, 1, missing, 3])
+ @test isequal(xtile(s_m, 20), [1, 2, 4, missing, 2, missing, 5])
+
+
+end+
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,6 +1,31 @@
+# --------------------------------------------------------------------------------------------------
using BazerData
using Test
-@testset "BazerData.jl" begin
- # Write your tests here.
+using PalmerPenguins
+using DataFrames
+using Dates
+using Random
+import StatsBase: quantile, Weights, sample
+using StreamToString
+
+const testsuite = [
+ "tabulate", "xtile", "winsorize", "panel_fill",
+]
+
+ENV["DATADEPS_ALWAYS_ACCEPT"] = true # for data loading of PalmerPenguins
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+printstyled("Running tests:\n", color=:blue, bold=true)
+
+@testset verbose=true "BazerData.jl" begin
+ for test in testsuite
+ println("\033[1m\033[32m → RUNNING\033[0m: $(test)")
+ include("UnitTests/$test.jl")
+ println("\033[1m\033[32m PASSED\033[0m")
+ end
end
+# --------------------------------------------------------------------------------------------------
+