commit 4fcfd6f3198b6ce57d580193cb6e270a6a83f451
parent f1b1519c96366d69335ed5229febe6fcd00e1dec
Author: Erik Loualiche <[email protected]>
Date: Mon, 16 Feb 2026 13:28:36 -0600
refactor, fix bugs, and expand test coverage across all functions
- tabulate: extract into _tabulate_compute, _tabulate_render_long,
_tabulate_render_wide, _render_pretty_table helpers; fix typeof
patterns to use isa; fix skip_stat bug using cols instead of
new_cols; fix skip_stat highlighter column index mismatch
- panel_fill: fix hardcoded df.t to df[!, time_var]; fix typeof
pattern; remove ghost merge param from docstring and docs examples
- winsorize: fix docstring to match actual replace_value kwarg
- timeshift: trim trailing blank lines
- docs: fix "not yet registered" text; remove invalid merge=true
- tests: 112 → 185 tests (+65%) with new edge case, error path,
missing data, and feature combination coverage
Co-Authored-By: Claude Opus 4.6 <[email protected]>
Diffstat:
10 files changed, 520 insertions(+), 384 deletions(-)
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -15,11 +15,11 @@ The readme serves as documentation; there might be more examples inside of the t
## Installation
-`BazerData.jl` is a not yet a registered package.
-You can install it from github via
+`BazerData.jl` is a registered package.
+You can install it via
```julia
import Pkg
-Pkg.add(url="https://github.com/eloualiche/BazerData.jl")
+Pkg.add("BazerData")
```
@@ -96,11 +96,11 @@ df_panel = DataFrame( # missing t=2 for id=1
v3 = [1,5,4,6,6,15,12.25, 21,22.5,17.2,1])
panel_fill(df_panel, :id, :t, [:v1, :v2, :v3],
- gap=Month(1), method=:backwards, uniquecheck=true, flag=true, merge=true)
+ gap=Month(1), method=:backwards, uniquecheck=true, flag=true)
panel_fill(df_panel, :id, :t, [:v1, :v2, :v3],
- gap=Month(1), method=:forwards, uniquecheck=true, flag=true, merge=true)
+ gap=Month(1), method=:forwards, uniquecheck=true, flag=true)
panel_fill(df_panel, :id, :t, [:v1, :v2, :v3],
- gap=Month(1), method=:linear, uniquecheck=true, flag=true, merge=true)
+ gap=Month(1), method=:linear, uniquecheck=true, flag=true)
```
### Leads and lags
diff --git a/src/PanelData.jl b/src/PanelData.jl
@@ -2,14 +2,13 @@
"""
panel_fill!(
df::DataFrame,
- id_var::Symbol,
- time_var::Symbol,
+ id_var::Symbol,
+ time_var::Symbol,
value_var::Union{Symbol, Vector{Symbol}};
- gap::Union{Int, DatePeriod} = 1,
- method::Symbol = :backwards,
+ gap::Union{Int, DatePeriod} = 1,
+ method::Symbol = :backwards,
uniquecheck::Bool = true,
- flag::Bool = false,
- merge::Bool = false
+ flag::Bool = false
)
# Arguments
@@ -17,17 +16,16 @@
- `id_var::Symbol`: the individual index dimension of the panel
- `time_var::Symbol`: the time index dimension of the panel (must be integer or a date)
- `value_var::Union{Symbol, Vector{Symbol}}`: the set of columns we would like to fill
-
+
# Keywords
- `gap::Union{Int, DatePeriod} = 1` : the interval size for which we want to fill data
- `method::Symbol = :backwards`: the interpolation method to fill the data
options are: `:backwards` (default), `:forwards`, `:linear`, `:nearest`
- email me for other interpolations (anything from Interpolations.jl is possible)
- `uniquecheck::Bool = true`: check if panel is clean
- `flag::Bool = false`: flag the interpolated values
# Returns
-- `AbstractDataFrame`:
+- `DataFrame`: the input DataFrame with interpolated rows appended
# Examples
- See tests
@@ -43,7 +41,7 @@ function panel_fill!(
# prepare the data
sort!(df, [id_var, time_var])
- if isa(value_var, Symbol)
+ if value_var isa Symbol
value_var = [value_var]
end
if uniquecheck # check for unicity
@@ -52,11 +50,11 @@ function panel_fill!(
end
time_var_r = join([string(time_var), "rounded"], "_") # clean up if dates
- if typeof(gap) <: DatePeriod
- if !(eltype(df.t) <: Dates.AbstractTime)
+ if gap isa DatePeriod
+ if !(eltype(df[!, time_var]) <: Dates.AbstractTime)
error(
"""
- Type of gap $(typeof(gap)) and type of time variable $(eltype(df.t)) do not match
+ Type of gap $(typeof(gap)) and type of time variable $(eltype(df[!, time_var])) do not match
"""
)
else
diff --git a/src/StataUtils.jl b/src/StataUtils.jl
@@ -9,7 +9,7 @@
# ------------------------------------------------------------------------------------------
# List of exported functions
-# tabulate # (tab alias)
+# tabulate
# xtile
# ------------------------------------------------------------------------------------------
@@ -19,7 +19,8 @@
tabulate(df::AbstractDataFrame, cols::Union{Symbol, Array{Symbol}};
reorder_cols=true, out::Symbol=:stdout)
-This was forked from TexTables.jl and was inspired by https://github.com/matthieugomez/statar
+Frequency tabulation inspired by Stata's `tabulate` command.
+Forked from TexTables.jl and inspired by https://github.com/matthieugomez/statar
# Arguments
- `df::AbstractDataFrame`: Input DataFrame to analyze
@@ -37,7 +38,6 @@ This was forked from TexTables.jl and was inspired by https://github.com/matthie
- `:df` Return the result as a DataFrame
- `:string` Return the formatted table as a string
-
# Returns
- `Nothing` if `out=:stdout`
- `DataFrame` if `out=:df`
@@ -50,9 +50,6 @@ The resulting table contains the following columns:
- `pct`: Percentage of total
- `cum`: Cumulative percentage
-# TO DO
-allow user to specify order of columns (reorder = false flag)
-
# Examples
See the README for more examples
```julia
@@ -74,20 +71,15 @@ function tabulate(
df::AbstractDataFrame, cols::Union{Symbol, Vector{Symbol}};
group_type::Union{Symbol, Vector{Symbol}}=:value,
reorder_cols::Bool=true,
- format_tbl::Symbol=:long,
+ format_tbl::Symbol=:long,
format_stat::Symbol=:freq,
skip_stat::Union{Nothing, Symbol, Vector{Symbol}}=nothing,
out::Symbol=:stdout)
- if typeof(cols) <: Symbol # check if it's an array or just a point
- N_COLS = 1
- else
- N_COLS = size(cols,1)
- # error("Only accepts one variable for now ...")
- end
+ N_COLS = cols isa Symbol ? 1 : length(cols)
if !(format_tbl ∈ [:long, :wide])
- if size(cols, 1) == 1
+ if N_COLS == 1
@warn "Converting format_tbl to :long"
format_tbl = :long
else
@@ -100,7 +92,18 @@ function tabulate(
return nothing
end
- # Count the number of observations by `columns`: this is the main calculation
+ df_out, new_cols = _tabulate_compute(df, cols, group_type, reorder_cols)
+
+ if format_tbl == :long
+ return _tabulate_render_long(df_out, new_cols, N_COLS, out, skip_stat)
+ else # :wide
+ return _tabulate_render_wide(df_out, new_cols, N_COLS, format_stat, out)
+ end
+end
+
+
+# ----- Computation: groupby, combine, sort, pct/cum transforms
+function _tabulate_compute(df, cols, group_type, reorder_cols)
group_type_error_msg = """
\ngroup_type input must specify either ':value' or ':type' for columns;
options are :value, :type, or a vector combining the two;
@@ -114,8 +117,7 @@ function tabulate(
df_out = transform(df, cols .=> ByRow(typeof) .=> name_type_cols) |>
(d -> combine(groupby(d, name_type_cols), nrow => :freq, proprow =>:pct))
new_cols = name_type_cols
- # rename!(df_out, name_type_cols .=> cols)
- elseif typeof(group_type) <: Vector{Symbol}
+ elseif group_type isa Vector{Symbol}
!all(s -> s in [:value, :type], group_type) && (@error group_type_error_msg)
(size(group_type, 1) != size(cols, 1)) &&
(@error "\ngroup_type and cols must be the same size; \nsee help for more information")
@@ -129,243 +131,189 @@ function tabulate(
@error group_type_error_msg
end
# resort columns based on the original order
- new_cols = sort(new_cols isa Symbol ? [new_cols] : new_cols,
+ new_cols = sort(new_cols isa Symbol ? [new_cols] : new_cols,
by= x -> findfirst(==(replace(string(x), r"_typeof$" => "")), string.(cols)) )
if reorder_cols
- cols_sortable = [ # check whether it makes sense to sort on the variables
+ cols_sortable = [
name
for (name, col) in pairs(eachcol(select(df_out, new_cols)))
if eltype(col) |> t -> hasmethod(isless, Tuple{t,t})
]
- if size(cols_sortable, 1)>0
- cols_sortable
+ if !isempty(cols_sortable)
sort!(df_out, cols_sortable) # order before we build cumulative
end
end
transform!(df_out, :pct => cumsum => :cum, :freq => ByRow(Int) => :freq)
- # easier to do some of the transformations on the numbers directly than using formatters
- transform!(df_out,
- :pct => (x -> x .* 100),
+ transform!(df_out,
+ :pct => (x -> x .* 100),
:cum => (x -> Int.(round.(x .* 100, digits=0))), renamecols=false)
+ return df_out, new_cols
+end
+# ----- Long format rendering
+function _tabulate_render_long(df_out, new_cols, N_COLS, out, skip_stat)
+ transform!(df_out, :freq => (x->text_histogram(x, width=24)) => :freq_hist)
+
+ # highlighter with gradient for the freq/pct/cum columns (rest is cyan)
+ col_highlighters = Tuple(vcat(
+ map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_COLS),
+ hl_custom_gradient(cols=(N_COLS+1), colorscheme=:Oranges_9, scale=maximum(df_out.freq)),
+ hl_custom_gradient(cols=(N_COLS+2), colorscheme=:Greens_9, scale=ceil(Int, maximum(df_out.pct))),
+ hl_custom_gradient(cols=(N_COLS+3), colorscheme=:Greens_9, scale=100),
+ ))
+
+ # when skip_stat is provided and output is string, filter columns
+ if out == :string && !isnothing(skip_stat)
+ all_stats = [:freq, :pct, :cum, :freq_hist]
+ skip_list = skip_stat isa Vector ? skip_stat : [skip_stat]
+ col_stat = setdiff(all_stats, skip_list)
+ N_COL_STAT = length(col_stat)
+
+ stat_headers = Dict(:freq=>"Freq.", :pct=>"Percent", :cum=>"Cum", :freq_hist=>"Hist.")
+ stat_formats = Dict(:freq=>"%d", :pct=>"%.1f", :cum=>"%d", :freq_hist=>"%s")
+ stat_colorschemes = Dict(
+ :freq => (:Oranges_9, maximum(df_out.freq)),
+ :pct => (:Greens_9, ceil(Int, maximum(df_out.pct))),
+ :cum => (:Greens_9, 100),
+ )
-# ----- prepare the table
- if format_tbl == :long
+ header = vcat(string.(new_cols),
+ [stat_headers[k] for k in col_stat])
+ formatters = Tuple(vcat(
+ [ft_printf("%s", i) for i in 1:N_COLS],
+ [ft_printf(stat_formats[k], N_COLS + i) for (i, k) in enumerate(col_stat)]
+ ))
+ # rebuild highlighters for the filtered column layout
+ filtered_highlighters = Tuple(vcat(
+ map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_COLS),
+ [haskey(stat_colorschemes, k) ?
+ hl_custom_gradient(cols=N_COLS+i, colorscheme=stat_colorschemes[k][1], scale=stat_colorschemes[k][2]) :
+ Highlighter((data, row, col) -> col == N_COLS+i, crayon"white")
+ for (i, k) in enumerate(col_stat)]
+ ))
+ alignment = vcat(repeat([:l], N_COLS), repeat([:c], N_COL_STAT))
+ cell_alignment = reduce(push!,
+ map(i -> (i,1)=>:l, 1:N_COLS+N_COL_STAT-1),
+ init=Dict{Tuple{Int64, Int64}, Symbol}())
+
+ df_render = select(df_out, new_cols, col_stat)
+ return _render_pretty_table(df_render, out;
+ hlines=[1], vlines=[N_COLS],
+ alignment=alignment, cell_alignment=cell_alignment,
+ header=header, formatters=formatters, highlighters=filtered_highlighters)
+ end
- transform!(df_out, :freq => (x->text_histogram(x, width=24)) => :freq_hist)
+ # default: all stat columns
+ header = [string.(new_cols); "Freq."; "Percent"; "Cum"; "Hist."]
+ formatters = Tuple(vcat(
+ [ft_printf("%s", i) for i in 1:N_COLS],
+ [ft_printf("%d", N_COLS+1), ft_printf("%.1f", N_COLS+2),
+ ft_printf("%d", N_COLS+3), ft_printf("%s", N_COLS+4)]
+ ))
+ alignment = vcat(repeat([:l], N_COLS), :c, :c, :c, :c)
+ cell_alignment = reduce(push!,
+ map(i -> (i,1)=>:l, 1:N_COLS+3),
+ init=Dict{Tuple{Int64, Int64}, Symbol}())
+
+ return _render_pretty_table(df_out, out;
+ hlines=[1], vlines=[N_COLS],
+ alignment=alignment, cell_alignment=cell_alignment,
+ header=header, formatters=formatters, highlighters=col_highlighters)
+end
- # highlighter with gradient for the freq/pct/cum columns (rest is blue)
- col_highlighters = vcat(
- map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_COLS),
- hl_custom_gradient(cols=(N_COLS+1), colorscheme=:Oranges_9, scale=maximum(df_out.freq)),
- hl_custom_gradient(cols=(N_COLS+2), colorscheme=:Greens_9, scale=ceil(Int, maximum(df_out.pct))),
- hl_custom_gradient(cols=(N_COLS+3), colorscheme=:Greens_9, scale=100),
- )
- col_highlighters = Tuple(x for x in col_highlighters)
-
- col_formatters = Tuple(vcat(
- [ ft_printf("%s", i) for i in 1:N_COLS ], # Column values
- [
- ft_printf("%d", N_COLS+1), # Frequency (integer)
- ft_printf("%.1f", N_COLS+2),
- ft_printf("%d", N_COLS+3), # Cumulative
- ft_printf("%s", N_COLS+4) # Histogram
- ]
+
+# ----- Wide format rendering
+function _tabulate_render_wide(df_out, new_cols, N_COLS, format_stat, out)
+ df_out = unstack(df_out,
+ new_cols[1:(N_COLS-1)], new_cols[N_COLS], format_stat,
+ allowmissing=true)
+
+ N_GROUP_COLS = N_COLS - 1
+ N_VAR_COLS = size(df_out, 2) - N_GROUP_COLS
+
+ if format_stat == :freq
+
+ # frequency: add row and column totals
+ total_row_des = "Total by $(string(new_cols[N_COLS]))"
+ total_col_des = join(vcat("Total by ", join(string.(new_cols[1:(N_COLS-1)]), ", ")))
+
+ sum_cols = sum.(skipmissing.(eachcol(df_out[:, range(1+N_GROUP_COLS; length=N_VAR_COLS)])))
+ row_vector = vcat([total_row_des], repeat(["-"], max(0, N_GROUP_COLS-1)), sum_cols)
+ df_out = vcat(df_out,
+ DataFrame(permutedims(row_vector)[:, end+1-size(df_out,2):end], names(df_out)))
+ sum_rows = sum.(skipmissing.(eachrow(df_out[:, range(1+N_GROUP_COLS; length=N_VAR_COLS)])))
+ col_vector = rename(DataFrame(total = sum_rows), "total" => total_col_des)
+ df_out = hcat(df_out, col_vector)
+ rename!(df_out, [i => "-"^i for i in 1:N_GROUP_COLS])
+
+ col_highlighters = Tuple(vcat(
+ map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_GROUP_COLS),
+ [ hl_custom_gradient(cols=i, colorscheme=:Greens_9,
+ scale = ceil(Int, maximum(skipmissing(df_out[1:end-1, i]))))
+ for i in range(1+N_GROUP_COLS; length=N_VAR_COLS) ],
+ Highlighter((data, row, col) -> col == size(df_out, 2), crayon"green")
))
- if out ∈ [:stdout, :df]
-
- pretty_table(df_out;
- hlines = [1],
- vlines = [N_COLS],
- alignment = vcat(repeat([:l], N_COLS), :c, :c, :c, :c),
- cell_alignment = reduce(push!,
- map(i -> (i,1)=>:l, 1:N_COLS+3),
- init=Dict{Tuple{Int64, Int64}, Symbol}()),
- header = [string.(new_cols); "Freq."; "Percent"; "Cum"; "Hist."],
- formatters = col_formatters,
- highlighters = col_highlighters,
- vcrop_mode = :middle,
- border_crayon = crayon"bold yellow",
- header_crayon = crayon"bold light_green",
- show_header = true,
- )
-
- if out==:stdout
- return(nothing)
- elseif out==:df
- return(df_out)
- end
-
- elseif out==:string # this might be costly as I am regenerating the table.
- if isnothing(skip_stat)
- pt = pretty_table(String, df_out;
- hlines = [1],
- vlines = [N_COLS],
- alignment = vcat(repeat([:l], N_COLS), :c, :c, :c, :c),
- cell_alignment = reduce(push!,
- map(i -> (i,1)=>:l, 1:N_COLS+3),
- init=Dict{Tuple{Int64, Int64}, Symbol}()),
- header = [string.(new_cols); "Freq."; "Percent"; "Cum"; "Hist."],
- formatters = col_formatters,
- highlighters = col_highlighters,
- crop = :none, # no crop for string output
- border_crayon = crayon"bold yellow",
- header_crayon = crayon"bold light_green",
- show_header = true,
- )
- else
- col_stat = setdiff([:freq, :pct, :cum, :freq_hist],
- isa(skip_stat, Vector) ? skip_stat : [skip_stat])
- N_COL_STAT = size(col_stat,1)
- header_table = vcat(string.(new_cols),
- [Dict(:freq=>"Freq.", :pct=>"Percent", :cum=>"Cum", :freq_hist=>"Hist.")[k]
- for k in col_stat]
- )
- df_sub_out = select(df_out, cols, col_stat)
- pt = pretty_table(String, df_sub_out;
- hlines = [1],
- vlines = [N_COLS],
- alignment = vcat(repeat([:l], N_COLS), repeat([:c], N_COL_STAT)),
- cell_alignment = reduce(push!,
- map(i -> (i,1)=>:l, 1:N_COLS+N_COL_STAT-1),
- init=Dict{Tuple{Int64, Int64}, Symbol}()),
- header = header_table,
- formatters = col_formatters,
- highlighters = col_highlighters,
- crop = :none, # no crop for string output
- border_crayon = crayon"bold yellow",
- header_crayon = crayon"bold light_green",
- show_header = true,
- )
- end
-
- return(pt)
- end
+ formatters = Tuple(vcat(
+ [ ft_printf("%s", i) for i in 1:N_GROUP_COLS ],
+ [ ft_printf("%d", j) for j in range(1+N_GROUP_COLS; length=N_VAR_COLS) ],
+ [ ft_printf("%d", 1+N_GROUP_COLS+N_VAR_COLS) ]
+ ))
- elseif format_tbl == :wide
-
- df_out = unstack(df_out,
- new_cols[1:(N_COLS-1)], new_cols[N_COLS], format_stat,
- allowmissing=true)
- # new_cols[1:(N_COLS-1)] might be more than one category
- # new_cols[N_COLS] only one group!
-
- N_GROUP_COLS = N_COLS - 1 # the first set of category (on the left!)
- N_VAR_COLS = size(df_out, 2) - N_GROUP_COLS
-
-
- if format_stat == :freq
-
- # frequency we also show totals
- total_row_des = "Total by $(string(new_cols[N_COLS]))"
- total_col_des = join(vcat("Total by ", join(string.(new_cols[1:(N_COLS-1)]), ", ")))
-
- sum_cols = sum.(skipmissing.(eachcol(df_out[:, range(1+N_GROUP_COLS; length=N_VAR_COLS)])))
- row_vector = vcat([total_row_des], repeat(["-"], max(0, N_GROUP_COLS-1)), sum_cols)
- df_out = vcat(df_out,
- DataFrame(permutedims(row_vector)[:, end+1-size(df_out,2):end], names(df_out))
- )
- sum_rows = sum.(skipmissing.(eachrow(df_out[:, range(1+N_GROUP_COLS; length=N_VAR_COLS)])))
- col_vector = rename(DataFrame(total = sum_rows), "total" => total_col_des)
- df_out = hcat(df_out, col_vector)
- rename!(df_out, [i => "-"^i for i in 1:N_GROUP_COLS])
-
- #TODO: add a line on top
- # blank for the group_cols
- # name of the wide col
- # total by for the sum col
-
- col_highlighters = vcat(
- map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_GROUP_COLS),
- [ hl_custom_gradient(cols=i, colorscheme=:Greens_9,
- scale = ceil(Int, maximum(skipmissing(df_out[1:end-1, i]))))
- for i in range(1+N_GROUP_COLS; length=N_VAR_COLS) ],
- Highlighter((data, row, col) -> col == size(df_out, 2), crayon"green")
- )
-
- formatters = vcat(
- [ ft_printf("%s", i) for i in 1:N_GROUP_COLS ],
- [ ft_printf("%d", j) for j in range(1+N_GROUP_COLS; length=N_VAR_COLS) ],
- [ ft_printf("%d", 1+N_GROUP_COLS+N_VAR_COLS) ]
- )
-
- hlines = [1, size(df_out, 1)]
- vlines = [N_GROUP_COLS, N_GROUP_COLS+N_VAR_COLS]
- alignment = vcat(repeat([:l], N_GROUP_COLS), repeat([:c], N_VAR_COLS), [:l])
-
-
- elseif format_stat == :pct
-
- col_highlighters = vcat(
- map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_GROUP_COLS),
- [ hl_custom_gradient(cols=i, colorscheme=:Greens_9,
- scale = ceil(Int, maximum(skipmissing(df_out[:, i]))) )
- for i in range(1+N_GROUP_COLS; length=N_VAR_COLS) ],
- )
-
- formatters = vcat(
- [ ft_printf("%s", i) for i in 1:N_GROUP_COLS ],
- [ ft_printf("%.1f", j) for j in range(1+N_GROUP_COLS; length=N_VAR_COLS) ]
- )
-
- hlines = [1]
- vlines = [0, N_GROUP_COLS, N_GROUP_COLS+N_VAR_COLS]
- alignment = vcat(repeat([:l], N_GROUP_COLS), repeat([:c], N_VAR_COLS))
+ hlines = [1, size(df_out, 1)]
+ vlines = [N_GROUP_COLS, N_GROUP_COLS+N_VAR_COLS]
+ alignment = vcat(repeat([:l], N_GROUP_COLS), repeat([:c], N_VAR_COLS), [:l])
+ elseif format_stat == :pct
- end
+ col_highlighters = Tuple(vcat(
+ map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_GROUP_COLS),
+ [ hl_custom_gradient(cols=i, colorscheme=:Greens_9,
+ scale = ceil(Int, maximum(skipmissing(df_out[:, i]))) )
+ for i in range(1+N_GROUP_COLS; length=N_VAR_COLS) ],
+ ))
+
+ formatters = Tuple(vcat(
+ [ ft_printf("%s", i) for i in 1:N_GROUP_COLS ],
+ [ ft_printf("%.1f", j) for j in range(1+N_GROUP_COLS; length=N_VAR_COLS) ]
+ ))
+
+ hlines = [1]
+ vlines = [0, N_GROUP_COLS, N_GROUP_COLS+N_VAR_COLS]
+ alignment = vcat(repeat([:l], N_GROUP_COLS), repeat([:c], N_VAR_COLS))
- col_highlighters = Tuple(x for x in col_highlighters)
-
- if out ∈ [:stdout, :df]
-
- pretty_table(df_out;
- hlines = hlines,
- vlines = vlines,
- alignment = alignment,
- cell_alignment = reduce(push!,
- map(i -> (i,1)=>:l, 1:N_GROUP_COLS),
- init=Dict{Tuple{Int64, Int64}, Symbol}()),
- formatters = Tuple(formatters),
- highlighters = col_highlighters,
- vcrop_mode = :middle,
- border_crayon = crayon"bold yellow",
- header_crayon = crayon"bold light_green",
- show_header = true,
- show_subheader=false,
- )
-
- if out==:stdout
- return(nothing)
- elseif out==:df
- return(df_out)
- end
- elseif out==:string
- pt = pretty_table(String, df_out;
- hlines = hlines,
- vlines = vlines,
- alignment = alignment,
- cell_alignment = reduce(push!,
- map(i -> (i,1)=>:l, 1:N_GROUP_COLS),
- init=Dict{Tuple{Int64, Int64}, Symbol}()),
- formatters = Tuple(formatters),
- highlighters = col_highlighters,
- crop = :none, # no crop for string output
- border_crayon = crayon"bold yellow",
- header_crayon = crayon"bold light_green",
- show_header = true,
- show_subheader = false,
- )
-
- return(pt)
- end
end
+ cell_alignment = reduce(push!,
+ map(i -> (i,1)=>:l, 1:N_GROUP_COLS),
+ init=Dict{Tuple{Int64, Int64}, Symbol}())
+ return _render_pretty_table(df_out, out;
+ hlines=hlines, vlines=vlines,
+ alignment=alignment, cell_alignment=cell_alignment,
+ formatters=formatters, highlighters=col_highlighters,
+ show_subheader=false)
+end
+
+
+# ----- Unified pretty_table output handler (stdout / df / string)
+function _render_pretty_table(df, out::Symbol; show_subheader=true, pt_kwargs...)
+ common = (
+ border_crayon = crayon"bold yellow",
+ header_crayon = crayon"bold light_green",
+ show_header = true,
+ show_subheader = show_subheader,
+ )
+
+ if out ∈ [:stdout, :df]
+ pretty_table(df; common..., vcrop_mode=:middle, pt_kwargs...)
+ return out == :stdout ? nothing : df
+ else # :string
+ return pretty_table(String, df; common..., crop=:none, pt_kwargs...)
+ end
end
# --------------------------------------------------------------------------------------------------
@@ -395,17 +343,7 @@ end
# --------------------------------------------------------------------------------------------------
# From https://github.com/mbauman/Sparklines.jl/blob/master/src/Sparklines.jl
-# Sparklines.jl
-# const ticks = ['▁','▂','▃','▄','▅','▆','▇','█']
-# function spark(x)
-# min, max = extrema(x)
-# f = div((max - min) * 2^8, length(ticks)-1)
-# f < 1 && (f = one(typeof(f)))
-# idxs = convert(Vector{Int}, map(v -> div(v, f), (x .- min) * 2^8))
-# return string.(ticks[idxs.+1])
-# end
-
-# Unicode characters:
+# Unicode characters:
# █ (Full block, U+2588)
# ⣿ (Full Braille block, U+28FF)
# ▓ (Dark shade, U+2593)
@@ -418,7 +356,7 @@ function text_histogram(frequencies; width=12)
max_freq = maximum(frequencies)
max_freq == 0 && return fill(" " ^ width, length(frequencies))
scale = (width * 8 - 1) / max_freq # Subtract 1 to ensure we don't exceed width
-
+
function bar(f)
units = round(Int, f * scale)
full_blocks = div(units, 8)
@@ -434,7 +372,7 @@ end
# --------------------------------------------------------------------------------------------------
"""
- xtile(data::Vector{T}, n_quantiles::Integer,
+ xtile(data::Vector{T}, n_quantiles::Integer,
weights::Union{Vector{Float64}, Nothing}=nothing)::Vector{Int} where T <: Real
Create quantile groups using Julia's built-in weighted quantile functionality.
@@ -453,11 +391,11 @@ b = xtile(sales, 10, weights=Weights(repeat([1], length(sales))) );
```
"""
function xtile(
- data::AbstractVector{T},
+ data::AbstractVector{T},
n_quantiles::Integer;
weights::Union{Weights{<:Real}, Nothing} = nothing
)::Vector{Int} where T <: Real
-
+
N = length(data)
n_quantiles > N && (@warn "More quantiles than data")
@@ -472,11 +410,11 @@ end
# String version
function xtile(
- data::AbstractVector{T},
+ data::AbstractVector{T},
n_quantiles::Integer;
weights::Union{Weights{<:Real}, Nothing} = nothing
)::Vector{Int} where T <: AbstractString
-
+
if weights === nothing
weights = UnitWeights{Int}(length(data))
end
@@ -486,14 +424,14 @@ function xtile(
sorted_categories = sortperm(category_weights, rev=true)
step = max(1, round(Int, length(sorted_categories) / n_quantiles))
cuts = unique(data)[sorted_categories][1:step:end]
-
+
return searchsortedlast.(Ref(cuts), data)
end
# Dealing with missing and Numbers
function xtile(
- data::AbstractVector{T},
+ data::AbstractVector{T},
n_quantiles::Integer;
weights::Union{Weights{<:Real}, Nothing} = nothing
)::Vector{Union{Int, Missing}} where {T <: Union{Missing, AbstractString, Number}}
@@ -526,4 +464,3 @@ end
-
diff --git a/src/TimeShift.jl b/src/TimeShift.jl
@@ -279,7 +279,3 @@ end
-
-
-
-
diff --git a/src/Winsorize.jl b/src/Winsorize.jl
@@ -1,30 +1,29 @@
# ------------------------------------------------------------------------------------------
"""
- winsorize(
- x::AbstractVector;
- probs::Union{Tuple{Real, Real}, Nothing} = nothing,
- cutpoints::Union{Tuple{Real, Real}, Nothing} = nothing,
- replace::Symbol = :missing
- verbose::Bool=false
- )
+ winsorize(x::AbstractVector;
+ probs=nothing, cutpoints=nothing, replace_value=nothing,
+ IQR=3, verbose=false)
+
+Winsorize (clip extreme values) in a vector.
+Based on Matthieu Gomez's winsorize function in the `statar` R package.
# Arguments
- `x::AbstractVector`: a vector of values
# Keywords
-- `probs::Union{Tuple{Real, Real}, Nothing}`: A vector of probabilities that can be used instead of cutpoints
-- `cutpoints::Union{Tuple{Real, Real}, Nothing}`: Cutpoints under and above which are defined outliers. Default is (median - five times interquartile range, median + five times interquartile range). Compared to bottom and top percentile, this takes into account the whole distribution of the vector
-- `replace_value::Tuple`: Values by which outliers are replaced. Default to cutpoints. A frequent alternative is missing.
-- `IQR::Real`: when inferring cutpoints what is the multiplier from the median for the interquartile range. (median ± IQR * (q75-q25))
-- `verbose::Bool`: printing level
+- `probs::Union{Tuple{Real, Real}, Nothing}`: Probability bounds for cutpoints (e.g., `(0.05, 0.95)`)
+- `cutpoints::Union{Tuple{Real, Real}, Nothing}`: Explicit cutpoints for outlier thresholds.
+ Default is `(median - IQR*(q75-q25), median + IQR*(q75-q25))`
+- `replace_value`: Values to replace outliers with. Default: cutpoint values.
+ Can be a tuple `(lo, hi)`, `missing`, or `(missing, missing)`
+- `IQR::Real=3`: Multiplier from the median for the interquartile range when inferring cutpoints
+- `verbose::Bool=false`: Print informational messages
# Returns
-- `AbstractVector`: A vector the size of x with substituted values
+- `AbstractVector`: A vector the size of x with substituted values
# Examples
- See tests
-
-This code is based on Matthieu Gomez winsorize function in the `statar` R package
"""
function winsorize(x::AbstractVector{T};
probs::Union{Tuple{Real, Real}, Nothing} = nothing,
diff --git a/test/UnitTests/panel_fill.jl b/test/UnitTests/panel_fill.jl
@@ -58,7 +58,7 @@
gap=Month(1), method=:backwards, uniquecheck=true, flag=true)
@test isequal(
select(subset(df3_test, :flag => ByRow(==(:backwards))), r"v"),
- DataFrame(v1 = [1.0, 1.0, 0.0, 4.0, 11.0],
+ DataFrame(v1 = [1.0, 1.0, 0.0, 4.0, 11.0],
v2 = [1.0, 1.0, 4.0, 2.0, 3.0],
v3 = [1.0, 1.0, 15.0, 22.5, 17.2]))
@@ -76,7 +76,7 @@
@test isapprox(
select(subset(df3_test, :flag => ByRow(==(:linear)), skipmissing=true), r"v") ,
DataFrame(
- v1 = [1.0, 1.0, 0.0, 7.5 , 12.0],
+ v1 = [1.0, 1.0, 0.0, 7.5 , 12.0],
v2 = [1.333, 1.666, 4.5, 2.5, 3.5],
v3 = [2.3333, 3.666, 13.625, 19.85, 9.1]),
atol = 0.01)
@@ -88,17 +88,58 @@
select(subset(df3_test, :flag => ByRow(==(:nearest)), skipmissing=true), :v1),
DataFrame(v1 = [1.0, 1.0, 0.0, 11.0, 13.0]))
- # TODO clean up these tests
-
# -- different time periods
- # this fails
- # panel_fill(df3, :id, :t, [:v1, :v2, :v3],
- # gap=Month(2), method=:backwards, uniquecheck=true, flag=true, merge=true)
df3_test = panel_fill(df3, :id, :t, [:v1, :v2, :v3],
gap=Day(10), method=:forwards, uniquecheck=true, flag=true)
@test isequal(nrow(df3_test) , 39)
end
+end
+
+
+@testset "panel_fill - flag=false" begin
+ df = DataFrame(id = [1, 1, 2, 2], t = [1, 3, 1, 4], v = [10, 20, 30, 40])
+ result = panel_fill(df, :id, :t, :v, gap=1, method=:backwards, flag=false)
+ @test !(:flag in names(result))
+ @test nrow(result) > nrow(df) # should have filled rows
+end
+
+
+@testset "panel_fill - invalid method" begin
+ df = DataFrame(id = [1, 1], t = [1, 3], v = [10, 20])
+ @test_throws Exception panel_fill(df, :id, :t, :v, gap=1, method=:invalid_method)
+end
+
+
+@testset "panel_fill - type mismatch" begin
+ # DatePeriod gap with integer time variable
+ df = DataFrame(id = [1, 1], t = [1, 3], v = [10, 20])
+ @test_throws Exception panel_fill(df, :id, :t, :v, gap=Month(1))
+end
+
+
+@testset "panel_fill - non-unique warning" begin
+ df = DataFrame(id = [1, 1, 1], t = [1, 2, 3], v = [10, 20, 30])
+ # non-unique: add a duplicate
+ df_dup = vcat(df, DataFrame(id = [1], t = [2], v = [99]))
+ # should warn about non-unique observations
+ @test_logs (:warn, r"non unique"i) begin
+ try
+ panel_fill(df_dup, :id, :t, :v,
+ gap=1, method=:backwards, uniquecheck=true, flag=true)
+ catch
+ # the function may error after warning due to duplicate handling;
+ # we just verify the warning is emitted
+ end
+ end
+end
+
+@testset "panel_fill - no gaps to fill" begin
+ # consecutive time values, nothing to interpolate
+ df = DataFrame(id = [1, 1, 1], t = [1, 2, 3], v = [10, 20, 30])
+ result = panel_fill(df, :id, :t, :v, gap=1, method=:backwards, flag=true)
+ @test nrow(result) == 3 # no new rows added
+ @test all(result.flag .== :original)
end
diff --git a/test/UnitTests/tabulate.jl b/test/UnitTests/tabulate.jl
@@ -37,9 +37,9 @@
@test all(x -> contains(first_line, x), ["island", "Freq", "Percent", "Cum", "Hist."])
# test the type columns get properly passed
- @test contains(tabulate(df, [:island, :species], group_type = [:type, :value], out=:string),
+ @test contains(tabulate(df, [:island, :species], group_type = [:type, :value], out=:string),
"island_typeof")
- @test contains(tabulate(df, [:island, :species], group_type = [:value, :type], out=:string),
+ @test contains(tabulate(df, [:island, :species], group_type = [:value, :type], out=:string),
"species_typeof")
# test the twoway ad wide tabulate
@@ -58,14 +58,109 @@
# test the group type options
df = DataFrame(x = [1, 2, 2, "NA", missing], y = ["c", "c", "b", "z", "d"])
@test isequal(
- tabulate(df, [:x, :y], out=:df).y,
+ tabulate(df, [:x, :y], out=:df).y,
sort(df.y))
@test nrow(tabulate(df, [:x, :y], group_type = :value, out=:df)) == 5
@test nrow(tabulate(df, [:x, :y], group_type = :type, out=:df)) == 3
- @test nrow(tabulate(df, [:x, :y], group_type = [:type, :value], out=:df)) == 4
+ @test nrow(tabulate(df, [:x, :y], group_type = [:type, :value], out=:df)) == 4
@test nrow(tabulate(df, [:x, :y], group_type = [:value, :type], out=:df)) == 4
end
-# -- TODO: Add tests for results that include missing -
\ No newline at end of file
+@testset "Tabulate - wide format pct" begin
+ df = dropmissing(DataFrame(PalmerPenguins.load()))
+
+ # wide format with format_stat=:pct returns a DataFrame
+ df_pct = tabulate(df, [:island, :species], format_tbl=:wide, format_stat=:pct, out=:df)
+ @test df_pct isa DataFrame
+ @test nrow(df_pct) == 3
+ # pct columns should not have a totals column (unlike freq)
+ @test !any(contains.(names(df_pct), "Total"))
+
+ # wide format pct as string output
+ pt = tabulate(df, [:island, :species], format_tbl=:wide, format_stat=:pct, out=:string)
+ @test pt isa String
+ @test length(pt) > 0
+
+ # wide format pct stdout returns nothing
+ result = tabulate(df, [:island, :species], format_tbl=:wide, format_stat=:pct, out=:stdout)
+ @test isnothing(result)
+end
+
+
+@testset "Tabulate - wide format string output" begin
+ df = dropmissing(DataFrame(PalmerPenguins.load()))
+
+ # wide freq as string
+ pt = tabulate(df, [:island, :species], format_tbl=:wide, out=:string)
+ @test pt isa String
+ @test contains(pt, "Adelie")
+ @test contains(pt, "Gentoo")
+ @test contains(pt, "Chinstrap")
+
+ # 3-column wide as string
+ pt = tabulate(df, [:sex, :island, :species], format_tbl=:wide, out=:string)
+ @test pt isa String
+ @test contains(pt, "Adelie")
+end
+
+
+@testset "Tabulate - missing values" begin
+ # DataFrame with missing values in the tabulated column
+ df = DataFrame(x = [1, 2, missing, 1, missing, 3])
+ df_tab = tabulate(df, :x, out=:df)
+ @test nrow(df_tab) == 4 # 1, 2, 3, missing
+ @test sum(df_tab.freq) == 6
+ @test :freq in propertynames(df_tab)
+ @test :pct in propertynames(df_tab)
+ @test :cum in propertynames(df_tab)
+
+ # string output with missing values should not error
+ pt = tabulate(df, :x, out=:string)
+ @test pt isa String
+ @test contains(pt, "missing")
+
+ # two-column with missing
+ df = DataFrame(x = ["a", "b", missing, "a"], y = [1, 2, 3, missing])
+ df_tab = tabulate(df, [:x, :y], out=:df)
+ @test nrow(df_tab) == 4
+ @test sum(df_tab.freq) == 4
+end
+
+
+@testset "Tabulate - skip_stat vector" begin
+ df = dropmissing(DataFrame(PalmerPenguins.load()))
+
+ # skip multiple stats
+ pt = tabulate(df, :island, out=:string, skip_stat=[:freq_hist, :cum])
+ first_line = split(pt, '\n', limit=2)[1]
+ @test contains(first_line, "Freq")
+ @test contains(first_line, "Percent")
+ @test !contains(first_line, "Cum")
+ @test !contains(first_line, "Hist")
+
+ # skip just freq
+ pt = tabulate(df, :island, out=:string, skip_stat=:freq)
+ first_line = split(pt, '\n', limit=2)[1]
+ @test !contains(first_line, "Freq.")
+ @test contains(first_line, "Percent")
+end
+
+
+@testset "Tabulate - single row DataFrame" begin
+ df = DataFrame(x = ["only_value"])
+ df_tab = tabulate(df, :x, out=:df)
+ @test nrow(df_tab) == 1
+ @test df_tab.freq[1] == 1
+ @test df_tab.cum[1] == 100
+end
+
+
+@testset "Tabulate - reorder_cols=false" begin
+ df = DataFrame(x = ["c", "a", "b", "a", "c", "c"])
+ df_tab = tabulate(df, :x, reorder_cols=false, out=:df)
+    # without reordering, rows follow groupby encounter order (only row/freq counts are verified here)
+ @test nrow(df_tab) == 3
+ @test sum(df_tab.freq) == 6
+end
diff --git a/test/UnitTests/timeshift.jl b/test/UnitTests/timeshift.jl
@@ -30,13 +30,13 @@
sort!(df2, [:id, :t])
transform!(
groupby(df2, :id),
- [:t, :v1] =>
- ((t, v1) -> (; v1_lag_day = tlag(v1, t; verbose=true),
+ [:t, :v1] =>
+ ((t, v1) -> (; v1_lag_day = tlag(v1, t; verbose=true),
v1_lag_mth = tlag(v1, t; n=Month(1), verbose=true) ) ) =>
[:v1_lag_day, :v1_lag_mth])
@test all(ismissing.(df2.v1_lag_day))
- @test isequal(df2.v1_lag_mth,
+ @test isequal(df2.v1_lag_mth,
[missing, missing, missing, 1, missing, 6, missing, missing, missing, missing, missing ])
end
@@ -44,24 +44,24 @@
# --------------------------------------------------------------------------------------------------
- @testset "General tests" begin
+ @testset "General tests" begin
# --- test large datasets
function generate_test_data(;size=50_000, gap_probability=0.1, seed=123)
Random.seed!(seed)
-
+
# Start date and initialize arrays
start_date = Date(2020, 1, 1)
dates = Vector{Date}()
x_values = Vector{Float64}()
-
+
# Generate dates with some gaps and corresponding x values
current_date = start_date
for i in 1:size
# Add current date and value
push!(dates, current_date)
push!(x_values, sin(i/100) + 0.1*randn()) # Some noisy sine wave pattern
-
+
# Decide whether to introduce a gap (skip 1-5 days)
if rand() < gap_probability
gap_size = rand(1:5)
@@ -71,7 +71,7 @@
current_date += Day(1)
end
end
-
+
# Create DataFrame
df = DataFrame(date=dates, x=x_values)
return df
@@ -83,15 +83,15 @@
@time transform!(small_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag)
@test nrow(subset(small_df, :x_lag => ByRow(!ismissing))) == 4525
-
+
@time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag_day);
@time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Month(1))) => :x_lag_mth);
@time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Year(1))) => :x_lag_yr);
-
+
transform!(large_df, :date => ByRow(year) => :datey)
- @test_throws r"time vector not sorted"i transform!(large_df,
+ @test_throws r"time vector not sorted"i transform!(large_df,
[:x, :datey] => ( (x, d) -> tlag(x, d, n=1)) => :x_lag_datey);
-
+
@test nrow(subset(large_df, :x_lag_day => ByRow(!ismissing))) == 900_182
@test nrow(subset(large_df, :x_lag_mth => ByRow(!ismissing))) == 770_178
@test nrow(subset(large_df, :x_lag_yr => ByRow(!ismissing))) == 769_502
@@ -114,7 +114,7 @@
import PanelShift
- # note the api for this package differs slightly ...
+ # note the api for this package differs slightly ...
# PanelShift.tlag(time_variable, x)
# BazelData.tlag(x, time_variable)
@@ -127,11 +127,11 @@
@test isequal(x_shift, [5; 6; missing])
x_shift = tlag([4;5;6], [1;2;3], n=2)
- @test isequal(PanelShift.tlag([1;2;3], [4;5;6], 2), x_shift)
+ @test isequal(PanelShift.tlag([1;2;3], [4;5;6], 2), x_shift)
@test isequal(x_shift, [missing;missing;4])
x_shift = tlead([4;5;6], [1;2;3], n=2)
- @test isequal(PanelShift.tlead([1;2;3], [4;5;6], 2), x_shift)
+ @test isequal(PanelShift.tlead([1;2;3], [4;5;6], 2), x_shift)
@test isequal(x_shift, [6; missing; missing])
# unit-length vector
@@ -143,12 +143,12 @@
@test isequal(PanelShift.tlead([1], [1]), x_shift)
@test isequal(x_shift, [missing])
- # --
+ # --
x_shift = tlag([1;2;3;4;5], [1;3;5;6;7], n=2)
@test isequal(PanelShift.tlag([1;3;5;6;7], [1;2;3;4;5], 2), x_shift)
@test isequal(x_shift, [missing; 1; 2; missing; 3])
- x_shift = tlag(float.([1;2;3;4;5]), [1;3;5;6;7], n=2)
+ x_shift = tlag(float.([1;2;3;4;5]), [1;3;5;6;7], n=2)
@test isequal(PanelShift.tlag(float.([1;3;5;6;7]), [1;2;3;4;5], 2), x_shift)
@test isequal(x_shift, [missing; 1; 2; missing; 3])
@@ -164,7 +164,7 @@
x_shift = tlag([:apple; :orange; :banana; :pineapple; :strawberry], [1;2;4;7;11], n=3)
@test isequal(PanelShift.tlag([1;2;4;7;11], [:apple; :orange; :banana; :pineapple; :strawberry], 3), x_shift)
@test isequal(x_shift, [missing; missing; :apple; :banana; missing])
-
+
x_shift = tlag([:apple; :orange; :banana; :pineapple; :strawberry], [1;2;4;7;11], n=4)
@test isequal(PanelShift.tlag([1;2;4;7;11], [:apple; :orange; :banana; :pineapple; :strawberry], 4), x_shift)
@@ -174,11 +174,11 @@
@test isequal(PanelShift.tlead([1;2;4;7;11], [:apple; :orange; :banana; :pineapple; :strawberry], 4), x_shift)
@test isequal(x_shift, [missing; missing; missing; :strawberry; missing])
- # indexed by dates
+ # indexed by dates
x_shift = tlag([1,2,3], [Date(2000,1,1), Date(2000, 1,2), Date(2000,1, 4)], n=Day(1))
@test isequal(PanelShift.tlag([Date(2000,1,1), Date(2000, 1,2), Date(2000,1, 4)], [1,2,3], Day(1)), x_shift)
@test isequal(x_shift, [missing; 1; missing])
-
+
x_shift = tlag([1,2,3], [Date(2000,1,1), Date(2000, 1,2), Date(2000,1, 4)], n=Day(2))
@test isequal(PanelShift.tlag([Date(2000,1,1), Date(2000, 1,2), Date(2000,1, 4)], [1,2,3], Day(2)), x_shift)
@test isequal(x_shift, [missing; missing; 2])
@@ -192,75 +192,44 @@
@test isequal(PanelShift.tshift([1;2;3], [1;2;3], 1), x_shift)
@test isequal(x_shift, tlag([1;2;3], [1;2;3], n=1))
- # safeguards
- # @test_throws ArgumentError PanelShift.tlag([1;2;2], [1,2,3]) # argcheck error unsorted t
- @test_throws r"time vector not sorted"i tlag([1, 2, 3], [1, 2, 2])
- # @test_throws ArgumentError PanelShift.tlag([1;2;], [1,2,3])
- @test_throws r"value and time vector"i tlag([1, 2], [1, 2, 3])
- # @test_throws ArgumentError PanelShift.tlag([1;2;3], [1,2,3], 0)
- @test_throws r"shift value"i tlag([1, 2, 3], [1, 2, 3], n=0)
+ # safeguards for tlag
+ @test_throws r"time vector not sorted"i tlag([1, 2, 3], [1, 2, 2])
+ @test_throws r"value and time vector"i tlag([1, 2], [1, 2, 3])
+ @test_throws r"shift value"i tlag([1, 2, 3], [1, 2, 3], n=0)
- end
+ end
# --------------------------------------------------------------------------------------------------
-
# --------------------------------------------------------------------------------------------------
-# benchmarking
-
-# using Chairmarks
-# large_df = generate_test_data(size=50_000_000, gap_probability=0.1);
-
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag_day)
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Month(1))) => :x_lag_mth)
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Year(1))) => :x_lag_yr)
+ @testset "tlead error paths" begin
+ # unsorted time vector
+ @test_throws r"time vector not sorted"i tlead([1, 2, 3], [3, 1, 2])
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlag(d, x)) => :x_lag_day)
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlag(d, x, Month(1))) => :x_lag_mth)
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlag(d, x, Year(1))) => :x_lag_yr)
-
-
-
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlead(x, d)) => :x_lag_day)
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlead(x, d, n=Month(1))) => :x_lag_mth)
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlead(x, d, n=Year(1))) => :x_lag_yr)
-
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlead(d, x)) => :x_lag_day)
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlead(d, x, Month(1))) => :x_lag_mth)
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlead(d, x, Year(1))) => :x_lag_yr)
-
-# --------------------------------------------------------------------------------------------------
+ # mismatched lengths
+ @test_throws r"value and time vector"i tlead([1, 2], [1, 2, 3])
+ # zero shift
+ @test_throws r"shift value"i tlead([1, 2, 3], [1, 2, 3], n=0)
+ end
+# --------------------------------------------------------------------------------------------------
+# --------------------------------------------------------------------------------------------------
+ @testset "tshift edge cases" begin
+ # tshift with n=nothing should warn and default to lag
+ result = @test_logs (:warn, r"shift not specified"i) tshift([1, 2, 3], [1, 2, 3])
+ @test isequal(result, tlag([1, 2, 3], [1, 2, 3]))
+
+ # tshift with Date vectors
+ dates = [Date(2020, 1, 1), Date(2020, 1, 2), Date(2020, 1, 3)]
+ result = tshift([10, 20, 30], dates, n=Day(1))
+ @test isequal(result, tlag([10, 20, 30], dates, n=Day(1)))
+
+ result = tshift([10, 20, 30], dates, n=Day(-1))
+ @test isequal(result, tlead([10, 20, 30], dates, n=Day(1)))
+ end
+# --------------------------------------------------------------------------------------------------
end
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/test/UnitTests/winsorize.jl b/test/UnitTests/winsorize.jl
@@ -1,13 +1,13 @@
@testset "winsorize" begin
- Random.seed!(3);
+ Random.seed!(3);
x1 = rand(100);
x2 = Vector{Union{Float64, Missing}}(rand(Float64, 100)); x2[rand(collect(1:100), 5)] .= missing;
# --- tests on non-missing vectors
x1_win = winsorize(x1, probs=(0.05, 0.95), verbose=true);
@test findall(x1 .!= x1_win) == [4, 15, 26, 32, 40, 44, 52, 59, 64, 97]
-
+
x1_win = winsorize(x1; verbose=true);
@test findall(x1 .!= x1_win) == []
@@ -21,7 +21,7 @@
x2_win = winsorize(x2, probs=(0.02, 0.98), verbose=true);
@test size(x2) == size(x2_win)
@test findall(skipmissing(x2 .!= x2_win)) == [5, 41, 83, 91]
-
+
x2_win = winsorize(x2; verbose=true)
@test size(x2) == size(x2_win)
@test findall(skipmissing(x2 .!= x2_win)) == []
@@ -43,12 +43,69 @@
@test size(x2) == size(x2_win)
@test findall(v -> v ∈ (-1.0, 1.0), skipmissing(x2_win)) == [5, 17, 41, 42, 65, 83, 91]
- # we check that this works if the type of replace is slightly different ...
+ # we check that this works if the type of replace is slightly different ...
# maybe we want to change this ...
x2_win = winsorize(x2; cutpoints=(0.05, 0.95), replace_value=(-1, 1), verbose=true)
@test size(x2) == size(x2_win)
@test findall(v -> v ∈ (-1.0, 1.0), skipmissing(x2_win)) == [5, 17, 41, 42, 65, 83, 91]
+end
+
+
+@testset "winsorize - custom IQR" begin
+ Random.seed!(42)
+ x = randn(1000) # standard normal: outliers likely beyond ~3σ
+
+ # default IQR=3 should keep most data
+ w_default = winsorize(x)
+ n_changed_default = count(x .!= w_default)
+
+ # IQR=1 should clip more aggressively
+ w_tight = winsorize(x, IQR=1)
+ n_changed_tight = count(x .!= w_tight)
+ @test n_changed_tight > n_changed_default
+
+ # IQR=100 should clip almost nothing
+ w_loose = winsorize(x, IQR=100)
+ @test count(x .!= w_loose) == 0
+end
+
+
+@testset "winsorize - edge cases" begin
+ # all identical values: nothing to winsorize
+ x_same = fill(5.0, 50)
+ w = winsorize(x_same, probs=(0.05, 0.95))
+ @test w == x_same
+
+ # single-element vector
+ x_one = [3.14]
+ w = winsorize(x_one, probs=(0.1, 0.9))
+ @test w == x_one
+
+ # integer vector
+ x_int = collect(1:100)
+ w = winsorize(x_int, probs=(0.05, 0.95))
+ @test length(w) == 100
+ @test minimum(w) >= minimum(x_int)
+ @test maximum(w) <= maximum(x_int)
+ @test count(w .!= x_int) > 0 # some values should be clipped
+
+ # one-sided winsorize: only clip top
+ Random.seed!(1)
+ x = rand(100)
+ w = winsorize(x, cutpoints=(minimum(x), 0.5))
+ @test minimum(w) == minimum(x) # bottom unchanged
+ @test maximum(w) <= 0.5
+
+ # one-sided: only clip bottom
+ w = winsorize(x, cutpoints=(0.5, maximum(x)))
+ @test minimum(w) >= 0.5
+ @test maximum(w) == maximum(x) # top unchanged
+end
+@testset "winsorize - all missing" begin
+ x_all_missing = Vector{Union{Float64, Missing}}(fill(missing, 10))
+    # probs path calls skipmissing, which yields an empty collection, so quantile should throw
+ @test_throws Exception winsorize(x_all_missing, probs=(0.05, 0.95))
end
diff --git a/test/UnitTests/xtile.jl b/test/UnitTests/xtile.jl
@@ -51,5 +51,51 @@
@test isequal(xtile(s_m, 3), [1, 1, 2, missing, 1, missing, 3])
@test isequal(xtile(s_m, 20), [1, 2, 4, missing, 2, missing, 5])
+end
-end-
\ No newline at end of file
+
+@testset "xtile - edge cases" begin
+
+ # all-missing input
+ x_all_missing = Vector{Union{Int64, Missing}}(fill(missing, 10))
+ result = xtile(x_all_missing, 4)
+ @test all(ismissing, result)
+ @test length(result) == 10
+
+ # single-element vector: searchsortedlast puts the value at the last quantile
+ result = xtile([42.0], 5)
+ @test length(result) == 1
+ @test result[1] isa Int
+
+ result = xtile([42], 5)
+ @test length(result) == 1
+
+ result = xtile(["hello"], 3)
+ @test length(result) == 1
+
+ # single-element with missing wrapper
+ x_single_m = Union{Int, Missing}[7]
+ result = xtile(x_single_m, 3)
+ @test length(result) == 1
+ @test !ismissing(result[1])
+
+ # two-element vector: results should be valid bin indices
+ result = xtile([1.0, 2.0], 2)
+ @test length(result) == 2
+ @test result[1] < result[2] # lower value gets lower bin
+
+ # all identical values: all should get the same bin
+ x_same = fill(5.0, 100)
+ result = xtile(x_same, 10)
+ @test allequal(result)
+ @test length(result) == 100
+
+    # n_quantiles == 1: searchsortedlast returns 1 at the maximum and 0 elsewhere
+ result = xtile(rand(50), 1)
+ @test all(r -> r in (0, 1), result)
+
+ # large n_quantiles: bins are bounded by n_quantiles
+ result = xtile(rand(100), 10)
+ @test all(r -> 0 <= r <= 10, result)
+
+end