commit 4fcfd6f3198b6ce57d580193cb6e270a6a83f451
parent f1b1519c96366d69335ed5229febe6fcd00e1dec
Author: Erik Loualiche <[email protected]>
Date: Mon, 16 Feb 2026 13:28:36 -0600
refactor, fix bugs, and expand test coverage across all functions
- tabulate: extract into _tabulate_compute, _tabulate_render_long,
_tabulate_render_wide, _render_pretty_table helpers; fix typeof
patterns to use isa; fix skip_stat bug using cols instead of
new_cols; fix skip_stat highlighter column index mismatch
- panel_fill: fix hardcoded df.t to df[!, time_var]; fix typeof
pattern; remove ghost merge param from docstring and docs examples
- winsorize: fix docstring to match actual replace_value kwarg
- timeshift: trim trailing blank lines
- docs: fix "not yet registered" text; remove invalid merge=true
- tests: 112 → 185 tests (+65%) with new edge case, error path,
missing data, and feature combination coverage
Co-Authored-By: Claude Opus 4.6 <[email protected]>
Diffstat:
10 files changed, 520 insertions(+), 384 deletions(-)
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -15,11 +15,11 @@ The readme serves as documentation; there might be more examples inside of the t
## Installation
-`BazerData.jl` is a not yet a registered package.
-You can install it from github via
+`BazerData.jl` is a registered package.
+You can install it via
```julia
import Pkg
-Pkg.add(url="https://github.com/eloualiche/BazerData.jl")
+Pkg.add("BazerData")
```
@@ -96,11 +96,11 @@ df_panel = DataFrame( # missing t=2 for id=1
v3 = [1,5,4,6,6,15,12.25, 21,22.5,17.2,1])
panel_fill(df_panel, :id, :t, [:v1, :v2, :v3],
- gap=Month(1), method=:backwards, uniquecheck=true, flag=true, merge=true)
+ gap=Month(1), method=:backwards, uniquecheck=true, flag=true)
panel_fill(df_panel, :id, :t, [:v1, :v2, :v3],
- gap=Month(1), method=:forwards, uniquecheck=true, flag=true, merge=true)
+ gap=Month(1), method=:forwards, uniquecheck=true, flag=true)
panel_fill(df_panel, :id, :t, [:v1, :v2, :v3],
- gap=Month(1), method=:linear, uniquecheck=true, flag=true, merge=true)
+ gap=Month(1), method=:linear, uniquecheck=true, flag=true)
```
### Leads and lags
diff --git a/src/PanelData.jl b/src/PanelData.jl
@@ -2,14 +2,13 @@
"""
panel_fill!(
df::DataFrame,
- id_var::Symbol,
- time_var::Symbol,
+ id_var::Symbol,
+ time_var::Symbol,
value_var::Union{Symbol, Vector{Symbol}};
- gap::Union{Int, DatePeriod} = 1,
- method::Symbol = :backwards,
+ gap::Union{Int, DatePeriod} = 1,
+ method::Symbol = :backwards,
uniquecheck::Bool = true,
- flag::Bool = false,
- merge::Bool = false
+ flag::Bool = false
)
# Arguments
@@ -17,17 +16,16 @@
- `id_var::Symbol`: the individual index dimension of the panel
- `time_var::Symbol`: the time index dimension of the panel (must be integer or a date)
- `value_var::Union{Symbol, Vector{Symbol}}`: the set of columns we would like to fill
-
+
# Keywords
- `gap::Union{Int, DatePeriod} = 1` : the interval size for which we want to fill data
- `method::Symbol = :backwards`: the interpolation method to fill the data
options are: `:backwards` (default), `:forwards`, `:linear`, `:nearest`
- email me for other interpolations (anything from Interpolations.jl is possible)
- `uniquecheck::Bool = true`: check if panel is clean
- `flag::Bool = false`: flag the interpolated values
# Returns
-- `AbstractDataFrame`:
+- `DataFrame`: the input DataFrame with interpolated rows appended
# Examples
- See tests
@@ -43,7 +41,7 @@ function panel_fill!(
# prepare the data
sort!(df, [id_var, time_var])
- if isa(value_var, Symbol)
+ if value_var isa Symbol
value_var = [value_var]
end
if uniquecheck # check for unicity
@@ -52,11 +50,11 @@ function panel_fill!(
end
time_var_r = join([string(time_var), "rounded"], "_") # clean up if dates
- if typeof(gap) <: DatePeriod
- if !(eltype(df.t) <: Dates.AbstractTime)
+ if gap isa DatePeriod
+ if !(eltype(df[!, time_var]) <: Dates.AbstractTime)
error(
"""
- Type of gap $(typeof(gap)) and type of time variable $(eltype(df.t)) do not match
+ Type of gap $(typeof(gap)) and type of time variable $(eltype(df[!, time_var])) do not match
"""
)
else
diff --git a/src/StataUtils.jl b/src/StataUtils.jl
@@ -9,7 +9,7 @@
# ------------------------------------------------------------------------------------------
# List of exported functions
-# tabulate # (tab alias)
+# tabulate
# xtile
# ------------------------------------------------------------------------------------------
@@ -19,7 +19,8 @@
tabulate(df::AbstractDataFrame, cols::Union{Symbol, Array{Symbol}};
reorder_cols=true, out::Symbol=:stdout)
-This was forked from TexTables.jl and was inspired by https://github.com/matthieugomez/statar
+Frequency tabulation inspired by Stata's `tabulate` command.
+Forked from TexTables.jl and inspired by https://github.com/matthieugomez/statar
# Arguments
- `df::AbstractDataFrame`: Input DataFrame to analyze
@@ -37,7 +38,6 @@ This was forked from TexTables.jl and was inspired by https://github.com/matthie
- `:df` Return the result as a DataFrame
- `:string` Return the formatted table as a string
-
# Returns
- `Nothing` if `out=:stdout`
- `DataFrame` if `out=:df`
@@ -50,9 +50,6 @@ The resulting table contains the following columns:
- `pct`: Percentage of total
- `cum`: Cumulative percentage
-# TO DO
-allow user to specify order of columns (reorder = false flag)
-
# Examples
See the README for more examples
```julia
@@ -74,20 +71,15 @@ function tabulate(
df::AbstractDataFrame, cols::Union{Symbol, Vector{Symbol}};
group_type::Union{Symbol, Vector{Symbol}}=:value,
reorder_cols::Bool=true,
- format_tbl::Symbol=:long,
+ format_tbl::Symbol=:long,
format_stat::Symbol=:freq,
skip_stat::Union{Nothing, Symbol, Vector{Symbol}}=nothing,
out::Symbol=:stdout)
- if typeof(cols) <: Symbol # check if it's an array or just a point
- N_COLS = 1
- else
- N_COLS = size(cols,1)
- # error("Only accepts one variable for now ...")
- end
+ N_COLS = cols isa Symbol ? 1 : length(cols)
if !(format_tbl ∈ [:long, :wide])
- if size(cols, 1) == 1
+ if N_COLS == 1
@warn "Converting format_tbl to :long"
format_tbl = :long
else
@@ -100,7 +92,18 @@ function tabulate(
return nothing
end
- # Count the number of observations by `columns`: this is the main calculation
+ df_out, new_cols = _tabulate_compute(df, cols, group_type, reorder_cols)
+
+ if format_tbl == :long
+ return _tabulate_render_long(df_out, new_cols, N_COLS, out, skip_stat)
+ else # :wide
+ return _tabulate_render_wide(df_out, new_cols, N_COLS, format_stat, out)
+ end
+end
+
+
+# ----- Computation: groupby, combine, sort, pct/cum transforms
+function _tabulate_compute(df, cols, group_type, reorder_cols)
group_type_error_msg = """
\ngroup_type input must specify either ':value' or ':type' for columns;
options are :value, :type, or a vector combining the two;
@@ -114,8 +117,7 @@ function tabulate(
df_out = transform(df, cols .=> ByRow(typeof) .=> name_type_cols) |>
(d -> combine(groupby(d, name_type_cols), nrow => :freq, proprow =>:pct))
new_cols = name_type_cols
- # rename!(df_out, name_type_cols .=> cols)
- elseif typeof(group_type) <: Vector{Symbol}
+ elseif group_type isa Vector{Symbol}
!all(s -> s in [:value, :type], group_type) && (@error group_type_error_msg)
(size(group_type, 1) != size(cols, 1)) &&
(@error "\ngroup_type and cols must be the same size; \nsee help for more information")
@@ -129,243 +131,189 @@ function tabulate(
@error group_type_error_msg
end
# resort columns based on the original order
- new_cols = sort(new_cols isa Symbol ? [new_cols] : new_cols,
+ new_cols = sort(new_cols isa Symbol ? [new_cols] : new_cols,
by= x -> findfirst(==(replace(string(x), r"_typeof$" => "")), string.(cols)) )
if reorder_cols
- cols_sortable = [ # check whether it makes sense to sort on the variables
+ cols_sortable = [
name
for (name, col) in pairs(eachcol(select(df_out, new_cols)))
if eltype(col) |> t -> hasmethod(isless, Tuple{t,t})
]
- if size(cols_sortable, 1)>0
- cols_sortable
+ if !isempty(cols_sortable)
sort!(df_out, cols_sortable) # order before we build cumulative
end
end
transform!(df_out, :pct => cumsum => :cum, :freq => ByRow(Int) => :freq)
- # easier to do some of the transformations on the numbers directly than using formatters
- transform!(df_out,
- :pct => (x -> x .* 100),
+ transform!(df_out,
+ :pct => (x -> x .* 100),
:cum => (x -> Int.(round.(x .* 100, digits=0))), renamecols=false)
+ return df_out, new_cols
+end
+# ----- Long format rendering
+function _tabulate_render_long(df_out, new_cols, N_COLS, out, skip_stat)
+ transform!(df_out, :freq => (x->text_histogram(x, width=24)) => :freq_hist)
+
+ # highlighter with gradient for the freq/pct/cum columns (rest is cyan)
+ col_highlighters = Tuple(vcat(
+ map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_COLS),
+ hl_custom_gradient(cols=(N_COLS+1), colorscheme=:Oranges_9, scale=maximum(df_out.freq)),
+ hl_custom_gradient(cols=(N_COLS+2), colorscheme=:Greens_9, scale=ceil(Int, maximum(df_out.pct))),
+ hl_custom_gradient(cols=(N_COLS+3), colorscheme=:Greens_9, scale=100),
+ ))
+
+ # when skip_stat is provided and output is string, filter columns
+ if out == :string && !isnothing(skip_stat)
+ all_stats = [:freq, :pct, :cum, :freq_hist]
+ skip_list = skip_stat isa Vector ? skip_stat : [skip_stat]
+ col_stat = setdiff(all_stats, skip_list)
+ N_COL_STAT = length(col_stat)
+
+ stat_headers = Dict(:freq=>"Freq.", :pct=>"Percent", :cum=>"Cum", :freq_hist=>"Hist.")
+ stat_formats = Dict(:freq=>"%d", :pct=>"%.1f", :cum=>"%d", :freq_hist=>"%s")
+ stat_colorschemes = Dict(
+ :freq => (:Oranges_9, maximum(df_out.freq)),
+ :pct => (:Greens_9, ceil(Int, maximum(df_out.pct))),
+ :cum => (:Greens_9, 100),
+ )
-# ----- prepare the table
- if format_tbl == :long
+ header = vcat(string.(new_cols),
+ [stat_headers[k] for k in col_stat])
+ formatters = Tuple(vcat(
+ [ft_printf("%s", i) for i in 1:N_COLS],
+ [ft_printf(stat_formats[k], N_COLS + i) for (i, k) in enumerate(col_stat)]
+ ))
+ # rebuild highlighters for the filtered column layout
+ filtered_highlighters = Tuple(vcat(
+ map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_COLS),
+ [haskey(stat_colorschemes, k) ?
+ hl_custom_gradient(cols=N_COLS+i, colorscheme=stat_colorschemes[k][1], scale=stat_colorschemes[k][2]) :
+ Highlighter((data, row, col) -> col == N_COLS+i, crayon"white")
+ for (i, k) in enumerate(col_stat)]
+ ))
+ alignment = vcat(repeat([:l], N_COLS), repeat([:c], N_COL_STAT))
+ cell_alignment = reduce(push!,
+ map(i -> (i,1)=>:l, 1:N_COLS+N_COL_STAT-1),
+ init=Dict{Tuple{Int64, Int64}, Symbol}())
+
+ df_render = select(df_out, new_cols, col_stat)
+ return _render_pretty_table(df_render, out;
+ hlines=[1], vlines=[N_COLS],
+ alignment=alignment, cell_alignment=cell_alignment,
+ header=header, formatters=formatters, highlighters=filtered_highlighters)
+ end
- transform!(df_out, :freq => (x->text_histogram(x, width=24)) => :freq_hist)
+ # default: all stat columns
+ header = [string.(new_cols); "Freq."; "Percent"; "Cum"; "Hist."]
+ formatters = Tuple(vcat(
+ [ft_printf("%s", i) for i in 1:N_COLS],
+ [ft_printf("%d", N_COLS+1), ft_printf("%.1f", N_COLS+2),
+ ft_printf("%d", N_COLS+3), ft_printf("%s", N_COLS+4)]
+ ))
+ alignment = vcat(repeat([:l], N_COLS), :c, :c, :c, :c)
+ cell_alignment = reduce(push!,
+ map(i -> (i,1)=>:l, 1:N_COLS+3),
+ init=Dict{Tuple{Int64, Int64}, Symbol}())
+
+ return _render_pretty_table(df_out, out;
+ hlines=[1], vlines=[N_COLS],
+ alignment=alignment, cell_alignment=cell_alignment,
+ header=header, formatters=formatters, highlighters=col_highlighters)
+end
- # highlighter with gradient for the freq/pct/cum columns (rest is blue)
- col_highlighters = vcat(
- map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_COLS),
- hl_custom_gradient(cols=(N_COLS+1), colorscheme=:Oranges_9, scale=maximum(df_out.freq)),
- hl_custom_gradient(cols=(N_COLS+2), colorscheme=:Greens_9, scale=ceil(Int, maximum(df_out.pct))),
- hl_custom_gradient(cols=(N_COLS+3), colorscheme=:Greens_9, scale=100),
- )
- col_highlighters = Tuple(x for x in col_highlighters)
-
- col_formatters = Tuple(vcat(
- [ ft_printf("%s", i) for i in 1:N_COLS ], # Column values
- [
- ft_printf("%d", N_COLS+1), # Frequency (integer)
- ft_printf("%.1f", N_COLS+2),
- ft_printf("%d", N_COLS+3), # Cumulative
- ft_printf("%s", N_COLS+4) # Histogram
- ]
+
+# ----- Wide format rendering
+function _tabulate_render_wide(df_out, new_cols, N_COLS, format_stat, out)
+ df_out = unstack(df_out,
+ new_cols[1:(N_COLS-1)], new_cols[N_COLS], format_stat,
+ allowmissing=true)
+
+ N_GROUP_COLS = N_COLS - 1
+ N_VAR_COLS = size(df_out, 2) - N_GROUP_COLS
+
+ if format_stat == :freq
+
+ # frequency: add row and column totals
+ total_row_des = "Total by $(string(new_cols[N_COLS]))"
+ total_col_des = join(vcat("Total by ", join(string.(new_cols[1:(N_COLS-1)]), ", ")))
+
+ sum_cols = sum.(skipmissing.(eachcol(df_out[:, range(1+N_GROUP_COLS; length=N_VAR_COLS)])))
+ row_vector = vcat([total_row_des], repeat(["-"], max(0, N_GROUP_COLS-1)), sum_cols)
+ df_out = vcat(df_out,
+ DataFrame(permutedims(row_vector)[:, end+1-size(df_out,2):end], names(df_out)))
+ sum_rows = sum.(skipmissing.(eachrow(df_out[:, range(1+N_GROUP_COLS; length=N_VAR_COLS)])))
+ col_vector = rename(DataFrame(total = sum_rows), "total" => total_col_des)
+ df_out = hcat(df_out, col_vector)
+ rename!(df_out, [i => "-"^i for i in 1:N_GROUP_COLS])
+
+ col_highlighters = Tuple(vcat(
+ map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_GROUP_COLS),
+ [ hl_custom_gradient(cols=i, colorscheme=:Greens_9,
+ scale = ceil(Int, maximum(skipmissing(df_out[1:end-1, i]))))
+ for i in range(1+N_GROUP_COLS; length=N_VAR_COLS) ],
+ Highlighter((data, row, col) -> col == size(df_out, 2), crayon"green")
))
- if out ∈ [:stdout, :df]
-
- pretty_table(df_out;
- hlines = [1],
- vlines = [N_COLS],
- alignment = vcat(repeat([:l], N_COLS), :c, :c, :c, :c),
- cell_alignment = reduce(push!,
- map(i -> (i,1)=>:l, 1:N_COLS+3),
- init=Dict{Tuple{Int64, Int64}, Symbol}()),
- header = [string.(new_cols); "Freq."; "Percent"; "Cum"; "Hist."],
- formatters = col_formatters,
- highlighters = col_highlighters,
- vcrop_mode = :middle,
- border_crayon = crayon"bold yellow",
- header_crayon = crayon"bold light_green",
- show_header = true,
- )
-
- if out==:stdout
- return(nothing)
- elseif out==:df
- return(df_out)
- end
-
- elseif out==:string # this might be costly as I am regenerating the table.
- if isnothing(skip_stat)
- pt = pretty_table(String, df_out;
- hlines = [1],
- vlines = [N_COLS],
- alignment = vcat(repeat([:l], N_COLS), :c, :c, :c, :c),
- cell_alignment = reduce(push!,
- map(i -> (i,1)=>:l, 1:N_COLS+3),
- init=Dict{Tuple{Int64, Int64}, Symbol}()),
- header = [string.(new_cols); "Freq."; "Percent"; "Cum"; "Hist."],
- formatters = col_formatters,
- highlighters = col_highlighters,
- crop = :none, # no crop for string output
- border_crayon = crayon"bold yellow",
- header_crayon = crayon"bold light_green",
- show_header = true,
- )
- else
- col_stat = setdiff([:freq, :pct, :cum, :freq_hist],
- isa(skip_stat, Vector) ? skip_stat : [skip_stat])
- N_COL_STAT = size(col_stat,1)
- header_table = vcat(string.(new_cols),
- [Dict(:freq=>"Freq.", :pct=>"Percent", :cum=>"Cum", :freq_hist=>"Hist.")[k]
- for k in col_stat]
- )
- df_sub_out = select(df_out, cols, col_stat)
- pt = pretty_table(String, df_sub_out;
- hlines = [1],
- vlines = [N_COLS],
- alignment = vcat(repeat([:l], N_COLS), repeat([:c], N_COL_STAT)),
- cell_alignment = reduce(push!,
- map(i -> (i,1)=>:l, 1:N_COLS+N_COL_STAT-1),
- init=Dict{Tuple{Int64, Int64}, Symbol}()),
- header = header_table,
- formatters = col_formatters,
- highlighters = col_highlighters,
- crop = :none, # no crop for string output
- border_crayon = crayon"bold yellow",
- header_crayon = crayon"bold light_green",
- show_header = true,
- )
- end
-
- return(pt)
- end
+ formatters = Tuple(vcat(
+ [ ft_printf("%s", i) for i in 1:N_GROUP_COLS ],
+ [ ft_printf("%d", j) for j in range(1+N_GROUP_COLS; length=N_VAR_COLS) ],
+ [ ft_printf("%d", 1+N_GROUP_COLS+N_VAR_COLS) ]
+ ))
- elseif format_tbl == :wide
-
- df_out = unstack(df_out,
- new_cols[1:(N_COLS-1)], new_cols[N_COLS], format_stat,
- allowmissing=true)
- # new_cols[1:(N_COLS-1)] might be more than one category
- # new_cols[N_COLS] only one group!
-
- N_GROUP_COLS = N_COLS - 1 # the first set of category (on the left!)
- N_VAR_COLS = size(df_out, 2) - N_GROUP_COLS
-
-
- if format_stat == :freq
-
- # frequency we also show totals
- total_row_des = "Total by $(string(new_cols[N_COLS]))"
- total_col_des = join(vcat("Total by ", join(string.(new_cols[1:(N_COLS-1)]), ", ")))
-
- sum_cols = sum.(skipmissing.(eachcol(df_out[:, range(1+N_GROUP_COLS; length=N_VAR_COLS)])))
- row_vector = vcat([total_row_des], repeat(["-"], max(0, N_GROUP_COLS-1)), sum_cols)
- df_out = vcat(df_out,
- DataFrame(permutedims(row_vector)[:, end+1-size(df_out,2):end], names(df_out))
- )
- sum_rows = sum.(skipmissing.(eachrow(df_out[:, range(1+N_GROUP_COLS; length=N_VAR_COLS)])))
- col_vector = rename(DataFrame(total = sum_rows), "total" => total_col_des)
- df_out = hcat(df_out, col_vector)
- rename!(df_out, [i => "-"^i for i in 1:N_GROUP_COLS])
-
- #TODO: add a line on top
- # blank for the group_cols
- # name of the wide col
- # total by for the sum col
-
- col_highlighters = vcat(
- map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_GROUP_COLS),
- [ hl_custom_gradient(cols=i, colorscheme=:Greens_9,
- scale = ceil(Int, maximum(skipmissing(df_out[1:end-1, i]))))
- for i in range(1+N_GROUP_COLS; length=N_VAR_COLS) ],
- Highlighter((data, row, col) -> col == size(df_out, 2), crayon"green")
- )
-
- formatters = vcat(
- [ ft_printf("%s", i) for i in 1:N_GROUP_COLS ],
- [ ft_printf("%d", j) for j in range(1+N_GROUP_COLS; length=N_VAR_COLS) ],
- [ ft_printf("%d", 1+N_GROUP_COLS+N_VAR_COLS) ]
- )
-
- hlines = [1, size(df_out, 1)]
- vlines = [N_GROUP_COLS, N_GROUP_COLS+N_VAR_COLS]
- alignment = vcat(repeat([:l], N_GROUP_COLS), repeat([:c], N_VAR_COLS), [:l])
-
-
- elseif format_stat == :pct
-
- col_highlighters = vcat(
- map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_GROUP_COLS),
- [ hl_custom_gradient(cols=i, colorscheme=:Greens_9,
- scale = ceil(Int, maximum(skipmissing(df_out[:, i]))) )
- for i in range(1+N_GROUP_COLS; length=N_VAR_COLS) ],
- )
-
- formatters = vcat(
- [ ft_printf("%s", i) for i in 1:N_GROUP_COLS ],
- [ ft_printf("%.1f", j) for j in range(1+N_GROUP_COLS; length=N_VAR_COLS) ]
- )
-
- hlines = [1]
- vlines = [0, N_GROUP_COLS, N_GROUP_COLS+N_VAR_COLS]
- alignment = vcat(repeat([:l], N_GROUP_COLS), repeat([:c], N_VAR_COLS))
+ hlines = [1, size(df_out, 1)]
+ vlines = [N_GROUP_COLS, N_GROUP_COLS+N_VAR_COLS]
+ alignment = vcat(repeat([:l], N_GROUP_COLS), repeat([:c], N_VAR_COLS), [:l])
+ elseif format_stat == :pct
- end
+ col_highlighters = Tuple(vcat(
+ map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_GROUP_COLS),
+ [ hl_custom_gradient(cols=i, colorscheme=:Greens_9,
+ scale = ceil(Int, maximum(skipmissing(df_out[:, i]))) )
+ for i in range(1+N_GROUP_COLS; length=N_VAR_COLS) ],
+ ))
+
+ formatters = Tuple(vcat(
+ [ ft_printf("%s", i) for i in 1:N_GROUP_COLS ],
+ [ ft_printf("%.1f", j) for j in range(1+N_GROUP_COLS; length=N_VAR_COLS) ]
+ ))
+
+ hlines = [1]
+ vlines = [0, N_GROUP_COLS, N_GROUP_COLS+N_VAR_COLS]
+ alignment = vcat(repeat([:l], N_GROUP_COLS), repeat([:c], N_VAR_COLS))
- col_highlighters = Tuple(x for x in col_highlighters)
-
- if out ∈ [:stdout, :df]
-
- pretty_table(df_out;
- hlines = hlines,
- vlines = vlines,
- alignment = alignment,
- cell_alignment = reduce(push!,
- map(i -> (i,1)=>:l, 1:N_GROUP_COLS),
- init=Dict{Tuple{Int64, Int64}, Symbol}()),
- formatters = Tuple(formatters),
- highlighters = col_highlighters,
- vcrop_mode = :middle,
- border_crayon = crayon"bold yellow",
- header_crayon = crayon"bold light_green",
- show_header = true,
- show_subheader=false,
- )
-
- if out==:stdout
- return(nothing)
- elseif out==:df
- return(df_out)
- end
- elseif out==:string
- pt = pretty_table(String, df_out;
- hlines = hlines,
- vlines = vlines,
- alignment = alignment,
- cell_alignment = reduce(push!,
- map(i -> (i,1)=>:l, 1:N_GROUP_COLS),
- init=Dict{Tuple{Int64, Int64}, Symbol}()),
- formatters = Tuple(formatters),
- highlighters = col_highlighters,
- crop = :none, # no crop for string output
- border_crayon = crayon"bold yellow",
- header_crayon = crayon"bold light_green",
- show_header = true,
- show_subheader = false,
- )
-
- return(pt)
- end
end
+ cell_alignment = reduce(push!,
+ map(i -> (i,1)=>:l, 1:N_GROUP_COLS),
+ init=Dict{Tuple{Int64, Int64}, Symbol}())
+ return _render_pretty_table(df_out, out;
+ hlines=hlines, vlines=vlines,
+ alignment=alignment, cell_alignment=cell_alignment,
+ formatters=formatters, highlighters=col_highlighters,
+ show_subheader=false)
+end
+
+
+# ----- Unified pretty_table output handler (stdout / df / string)
+function _render_pretty_table(df, out::Symbol; show_subheader=true, pt_kwargs...)
+ common = (
+ border_crayon = crayon"bold yellow",
+ header_crayon = crayon"bold light_green",
+ show_header = true,
+ show_subheader = show_subheader,
+ )
+
+ if out ∈ [:stdout, :df]
+ pretty_table(df; common..., vcrop_mode=:middle, pt_kwargs...)
+ return out == :stdout ? nothing : df
+ else # :string
+ return pretty_table(String, df; common..., crop=:none, pt_kwargs...)
+ end
end
# --------------------------------------------------------------------------------------------------
@@ -395,17 +343,7 @@ end
# --------------------------------------------------------------------------------------------------
# From https://github.com/mbauman/Sparklines.jl/blob/master/src/Sparklines.jl
-# Sparklines.jl
-# const ticks = ['▁','▂','▃','▄','▅','▆','▇','█']
-# function spark(x)
-# min, max = extrema(x)
-# f = div((max - min) * 2^8, length(ticks)-1)
-# f < 1 && (f = one(typeof(f)))
-# idxs = convert(Vector{Int}, map(v -> div(v, f), (x .- min) * 2^8))
-# return string.(ticks[idxs.+1])
-# end
-
-# Unicode characters:
+# Unicode characters:
# █ (Full block, U+2588)
# ⣿ (Full Braille block, U+28FF)
# ▓ (Dark shade, U+2593)
@@ -418,7 +356,7 @@ function text_histogram(frequencies; width=12)
max_freq = maximum(frequencies)
max_freq == 0 && return fill(" " ^ width, length(frequencies))
scale = (width * 8 - 1) / max_freq # Subtract 1 to ensure we don't exceed width
-
+
function bar(f)
units = round(Int, f * scale)
full_blocks = div(units, 8)
@@ -434,7 +372,7 @@ end
# --------------------------------------------------------------------------------------------------
"""
- xtile(data::Vector{T}, n_quantiles::Integer,
+ xtile(data::Vector{T}, n_quantiles::Integer,
weights::Union{Vector{Float64}, Nothing}=nothing)::Vector{Int} where T <: Real
Create quantile groups using Julia's built-in weighted quantile functionality.
@@ -453,11 +391,11 @@ b = xtile(sales, 10, weights=Weights(repeat([1], length(sales))) );
```
"""
function xtile(
- data::AbstractVector{T},
+ data::AbstractVector{T},
n_quantiles::Integer;
weights::Union{Weights{<:Real}, Nothing} = nothing
)::Vector{Int} where T <: Real
-
+
N = length(data)
n_quantiles > N && (@warn "More quantiles than data")
@@ -472,11 +410,11 @@ end
# String version
function xtile(
- data::AbstractVector{T},
+ data::AbstractVector{T},
n_quantiles::Integer;
weights::Union{Weights{<:Real}, Nothing} = nothing
)::Vector{Int} where T <: AbstractString
-
+
if weights === nothing
weights = UnitWeights{Int}(length(data))
end
@@ -486,14 +424,14 @@ function xtile(
sorted_categories = sortperm(category_weights, rev=true)
step = max(1, round(Int, length(sorted_categories) / n_quantiles))
cuts = unique(data)[sorted_categories][1:step:end]
-
+
return searchsortedlast.(Ref(cuts), data)
end
# Dealing with missing and Numbers
function xtile(
- data::AbstractVector{T},
+ data::AbstractVector{T},
n_quantiles::Integer;
weights::Union{Weights{<:Real}, Nothing} = nothing
)::Vector{Union{Int, Missing}} where {T <: Union{Missing, AbstractString, Number}}
@@ -526,4 +464,3 @@ end
-
diff --git a/src/TimeShift.jl b/src/TimeShift.jl
@@ -279,7 +279,3 @@ end
-
-
-
-
diff --git a/src/Winsorize.jl b/src/Winsorize.jl
@@ -1,30 +1,29 @@
# ------------------------------------------------------------------------------------------
"""
- winsorize(
- x::AbstractVector;
- probs::Union{Tuple{Real, Real}, Nothing} = nothing,
- cutpoints::Union{Tuple{Real, Real}, Nothing} = nothing,
- replace::Symbol = :missing
- verbose::Bool=false
- )
+ winsorize(x::AbstractVector;
+ probs=nothing, cutpoints=nothing, replace_value=nothing,
+ IQR=3, verbose=false)
+
+Winsorize (clip extreme values) in a vector.
+Based on Matthieu Gomez's winsorize function in the `statar` R package.
# Arguments
- `x::AbstractVector`: a vector of values
# Keywords
-- `probs::Union{Tuple{Real, Real}, Nothing}`: A vector of probabilities that can be used instead of cutpoints
-- `cutpoints::Union{Tuple{Real, Real}, Nothing}`: Cutpoints under and above which are defined outliers. Default is (median - five times interquartile range, median + five times interquartile range). Compared to bottom and top percentile, this takes into account the whole distribution of the vector
-- `replace_value::Tuple`: Values by which outliers are replaced. Default to cutpoints. A frequent alternative is missing.
-- `IQR::Real`: when inferring cutpoints what is the multiplier from the median for the interquartile range. (median ± IQR * (q75-q25))
-- `verbose::Bool`: printing level
+- `probs::Union{Tuple{Real, Real}, Nothing}`: Probability bounds for cutpoints (e.g., `(0.05, 0.95)`)
+- `cutpoints::Union{Tuple{Real, Real}, Nothing}`: Explicit cutpoints for outlier thresholds.
+ Default is `(median - IQR*(q75-q25), median + IQR*(q75-q25))`
+- `replace_value`: Values to replace outliers with. Default: cutpoint values.
+ Can be a tuple `(lo, hi)`, `missing`, or `(missing, missing)`
+- `IQR::Real=3`: Multiplier from the median for the interquartile range when inferring cutpoints
+- `verbose::Bool=false`: Print informational messages
# Returns
-- `AbstractVector`: A vector the size of x with substituted values
+- `AbstractVector`: A vector the size of x with substituted values
# Examples
- See tests
-
-This code is based on Matthieu Gomez winsorize function in the `statar` R package
"""
function winsorize(x::AbstractVector{T};
probs::Union{Tuple{Real, Real}, Nothing} = nothing,
diff --git a/test/UnitTests/panel_fill.jl b/test/UnitTests/panel_fill.jl
@@ -58,7 +58,7 @@
gap=Month(1), method=:backwards, uniquecheck=true, flag=true)
@test isequal(
select(subset(df3_test, :flag => ByRow(==(:backwards))), r"v"),
- DataFrame(v1 = [1.0, 1.0, 0.0, 4.0, 11.0],
+ DataFrame(v1 = [1.0, 1.0, 0.0, 4.0, 11.0],
v2 = [1.0, 1.0, 4.0, 2.0, 3.0],
v3 = [1.0, 1.0, 15.0, 22.5, 17.2]))
@@ -76,7 +76,7 @@
@test isapprox(
select(subset(df3_test, :flag => ByRow(==(:linear)), skipmissing=true), r"v") ,
DataFrame(
- v1 = [1.0, 1.0, 0.0, 7.5 , 12.0],
+ v1 = [1.0, 1.0, 0.0, 7.5 , 12.0],
v2 = [1.333, 1.666, 4.5, 2.5, 3.5],
v3 = [2.3333, 3.666, 13.625, 19.85, 9.1]),
atol = 0.01)
@@ -88,17 +88,58 @@
select(subset(df3_test, :flag => ByRow(==(:nearest)), skipmissing=true), :v1),
DataFrame(v1 = [1.0, 1.0, 0.0, 11.0, 13.0]))
- # TODO clean up these tests
-
# -- different time periods
- # this fails
- # panel_fill(df3, :id, :t, [:v1, :v2, :v3],
- # gap=Month(2), method=:backwards, uniquecheck=true, flag=true, merge=true)
df3_test = panel_fill(df3, :id, :t, [:v1, :v2, :v3],
gap=Day(10), method=:forwards, uniquecheck=true, flag=true)
@test isequal(nrow(df3_test) , 39)
end
+end
+
+
+@testset "panel_fill - flag=false" begin
+ df = DataFrame(id = [1, 1, 2, 2], t = [1, 3, 1, 4], v = [10, 20, 30, 40])
+ result = panel_fill(df, :id, :t, :v, gap=1, method=:backwards, flag=false)
+ @test !(:flag in names(result))
+ @test nrow(result) > nrow(df) # should have filled rows
+end
+
+
+@testset "panel_fill - invalid method" begin
+ df = DataFrame(id = [1, 1], t = [1, 3], v = [10, 20])
+ @test_throws Exception panel_fill(df, :id, :t, :v, gap=1, method=:invalid_method)
+end
+
+
+@testset "panel_fill - type mismatch" begin
+ # DatePeriod gap with integer time variable
+ df = DataFrame(id = [1, 1], t = [1, 3], v = [10, 20])
+ @test_throws Exception panel_fill(df, :id, :t, :v, gap=Month(1))
+end
+
+
+@testset "panel_fill - non-unique warning" begin
+ df = DataFrame(id = [1, 1, 1], t = [1, 2, 3], v = [10, 20, 30])
+ # non-unique: add a duplicate
+ df_dup = vcat(df, DataFrame(id = [1], t = [2], v = [99]))
+ # should warn about non-unique observations
+ @test_logs (:warn, r"non unique"i) begin
+ try
+ panel_fill(df_dup, :id, :t, :v,
+ gap=1, method=:backwards, uniquecheck=true, flag=true)
+ catch
+ # the function may error after warning due to duplicate handling;
+ # we just verify the warning is emitted
+ end
+ end
+end
+
+@testset "panel_fill - no gaps to fill" begin
+ # consecutive time values, nothing to interpolate
+ df = DataFrame(id = [1, 1, 1], t = [1, 2, 3], v = [10, 20, 30])
+ result = panel_fill(df, :id, :t, :v, gap=1, method=:backwards, flag=true)
+ @test nrow(result) == 3 # no new rows added
+ @test all(result.flag .== :original)
end
diff --git a/test/UnitTests/tabulate.jl b/test/UnitTests/tabulate.jl
@@ -37,9 +37,9 @@
@test all(x -> contains(first_line, x), ["island", "Freq", "Percent", "Cum", "Hist."])
# test the type columns get properly passed
- @test contains(tabulate(df, [:island, :species], group_type = [:type, :value], out=:string),
+ @test contains(tabulate(df, [:island, :species], group_type = [:type, :value], out=:string),
"island_typeof")
- @test contains(tabulate(df, [:island, :species], group_type = [:value, :type], out=:string),
+ @test contains(tabulate(df, [:island, :species], group_type = [:value, :type], out=:string),
"species_typeof")
# test the twoway ad wide tabulate
@@ -58,14 +58,109 @@
# test the group type options
df = DataFrame(x = [1, 2, 2, "NA", missing], y = ["c", "c", "b", "z", "d"])
@test isequal(
- tabulate(df, [:x, :y], out=:df).y,
+ tabulate(df, [:x, :y], out=:df).y,
sort(df.y))
@test nrow(tabulate(df, [:x, :y], group_type = :value, out=:df)) == 5
@test nrow(tabulate(df, [:x, :y], group_type = :type, out=:df)) == 3
- @test nrow(tabulate(df, [:x, :y], group_type = [:type, :value], out=:df)) == 4
+ @test nrow(tabulate(df, [:x, :y], group_type = [:type, :value], out=:df)) == 4
@test nrow(tabulate(df, [:x, :y], group_type = [:value, :type], out=:df)) == 4
end
-# -- TODO: Add tests for results that include missing -
\ No newline at end of file
+@testset "Tabulate - wide format pct" begin
+ df = dropmissing(DataFrame(PalmerPenguins.load()))
+
+ # wide format with format_stat=:pct returns a DataFrame
+ df_pct = tabulate(df, [:island, :species], format_tbl=:wide, format_stat=:pct, out=:df)
+ @test df_pct isa DataFrame
+ @test nrow(df_pct) == 3
+ # pct columns should not have a totals column (unlike freq)
+ @test !any(contains.(names(df_pct), "Total"))
+
+ # wide format pct as string output
+ pt = tabulate(df, [:island, :species], format_tbl=:wide, format_stat=:pct, out=:string)
+ @test pt isa String
+ @test length(pt) > 0
+
+ # wide format pct stdout returns nothing
+ result = tabulate(df, [:island, :species], format_tbl=:wide, format_stat=:pct, out=:stdout)
+ @test isnothing(result)
+end
+
+
+@testset "Tabulate - wide format string output" begin
+ df = dropmissing(DataFrame(PalmerPenguins.load()))
+
+ # wide freq as string
+ pt = tabulate(df, [:island, :species], format_tbl=:wide, out=:string)
+ @test pt isa String
+ @test contains(pt, "Adelie")
+ @test contains(pt, "Gentoo")
+ @test contains(pt, "Chinstrap")
+
+ # 3-column wide as string
+ pt = tabulate(df, [:sex, :island, :species], format_tbl=:wide, out=:string)
+ @test pt isa String
+ @test contains(pt, "Adelie")
+end
+
+
+@testset "Tabulate - missing values" begin
+ # DataFrame with missing values in the tabulated column
+ df = DataFrame(x = [1, 2, missing, 1, missing, 3])
+ df_tab = tabulate(df, :x, out=:df)
+ @test nrow(df_tab) == 4 # 1, 2, 3, missing
+ @test sum(df_tab.freq) == 6
+ @test :freq in propertynames(df_tab)
+ @test :pct in propertynames(df_tab)
+ @test :cum in propertynames(df_tab)
+
+ # string output with missing values should not error
+ pt = tabulate(df, :x, out=:string)
+ @test pt isa String
+ @test contains(pt, "missing")
+
+ # two-column with missing
+ df = DataFrame(x = ["a", "b", missing, "a"], y = [1, 2, 3, missing])
+ df_tab = tabulate(df, [:x, :y], out=:df)
+ @test nrow(df_tab) == 4
+ @test sum(df_tab.freq) == 4
+end
+
+
+@testset "Tabulate - skip_stat vector" begin
+ df = dropmissing(DataFrame(PalmerPenguins.load()))
+
+ # skip multiple stats
+ pt = tabulate(df, :island, out=:string, skip_stat=[:freq_hist, :cum])
+ first_line = split(pt, '\n', limit=2)[1]
+ @test contains(first_line, "Freq")
+ @test contains(first_line, "Percent")
+ @test !contains(first_line, "Cum")
+ @test !contains(first_line, "Hist")
+
+ # skip just freq
+ pt = tabulate(df, :island, out=:string, skip_stat=:freq)
+ first_line = split(pt, '\n', limit=2)[1]
+ @test !contains(first_line, "Freq.")
+ @test contains(first_line, "Percent")
+end
+
+
+@testset "Tabulate - single row DataFrame" begin
+ df = DataFrame(x = ["only_value"])
+ df_tab = tabulate(df, :x, out=:df)
+ @test nrow(df_tab) == 1
+ @test df_tab.freq[1] == 1
+ @test df_tab.cum[1] == 100
+end
+
+
+@testset "Tabulate - reorder_cols=false" begin
+ df = DataFrame(x = ["c", "a", "b", "a", "c", "c"])
+ df_tab = tabulate(df, :x, reorder_cols=false, out=:df)
+    # without reordering, rows follow groupby encounter order (only row/freq counts are verified here)
+ @test nrow(df_tab) == 3
+ @test sum(df_tab.freq) == 6
+end
diff --git a/test/UnitTests/timeshift.jl b/test/UnitTests/timeshift.jl
@@ -30,13 +30,13 @@
sort!(df2, [:id, :t])
transform!(
groupby(df2, :id),
- [:t, :v1] =>
- ((t, v1) -> (; v1_lag_day = tlag(v1, t; verbose=true),
+ [:t, :v1] =>
+ ((t, v1) -> (; v1_lag_day = tlag(v1, t; verbose=true),
v1_lag_mth = tlag(v1, t; n=Month(1), verbose=true) ) ) =>
[:v1_lag_day, :v1_lag_mth])
@test all(ismissing.(df2.v1_lag_day))
- @test isequal(df2.v1_lag_mth,
+ @test isequal(df2.v1_lag_mth,
[missing, missing, missing, 1, missing, 6, missing, missing, missing, missing, missing ])
end
@@ -44,24 +44,24 @@
# --------------------------------------------------------------------------------------------------
- @testset "General tests" begin
+ @testset "General tests" begin
# --- test large datasets
function generate_test_data(;size=50_000, gap_probability=0.1, seed=123)
Random.seed!(seed)
-
+
# Start date and initialize arrays
start_date = Date(2020, 1, 1)
dates = Vector{Date}()
x_values = Vector{Float64}()
-
+
# Generate dates with some gaps and corresponding x values
current_date = start_date
for i in 1:size
# Add current date and value
push!(dates, current_date)
push!(x_values, sin(i/100) + 0.1*randn()) # Some noisy sine wave pattern
-
+
# Decide whether to introduce a gap (skip 1-5 days)
if rand() < gap_probability
gap_size = rand(1:5)
@@ -71,7 +71,7 @@
current_date += Day(1)
end
end
-
+
# Create DataFrame
df = DataFrame(date=dates, x=x_values)
return df
@@ -83,15 +83,15 @@
@time transform!(small_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag)
@test nrow(subset(small_df, :x_lag => ByRow(!ismissing))) == 4525
-
+
@time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag_day);
@time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Month(1))) => :x_lag_mth);
@time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Year(1))) => :x_lag_yr);
-
+
transform!(large_df, :date => ByRow(year) => :datey)
- @test_throws r"time vector not sorted"i transform!(large_df,
+ @test_throws r"time vector not sorted"i transform!(large_df,
[:x, :datey] => ( (x, d) -> tlag(x, d, n=1)) => :x_lag_datey);
-
+
@test nrow(subset(large_df, :x_lag_day => ByRow(!ismissing))) == 900_182
@test nrow(subset(large_df, :x_lag_mth => ByRow(!ismissing))) == 770_178
@test nrow(subset(large_df, :x_lag_yr => ByRow(!ismissing))) == 769_502
@@ -114,7 +114,7 @@
import PanelShift
- # note the api for this package differs slightly ...
+ # note the api for this package differs slightly ...
# PanelShift.tlag(time_variable, x)
# BazelData.tlag(x, time_variable)
@@ -127,11 +127,11 @@
@test isequal(x_shift, [5; 6; missing])
x_shift = tlag([4;5;6], [1;2;3], n=2)
- @test isequal(PanelShift.tlag([1;2;3], [4;5;6], 2), x_shift)
+ @test isequal(PanelShift.tlag([1;2;3], [4;5;6], 2), x_shift)
@test isequal(x_shift, [missing;missing;4])
x_shift = tlead([4;5;6], [1;2;3], n=2)
- @test isequal(PanelShift.tlead([1;2;3], [4;5;6], 2), x_shift)
+ @test isequal(PanelShift.tlead([1;2;3], [4;5;6], 2), x_shift)
@test isequal(x_shift, [6; missing; missing])
# unit-length vector
@@ -143,12 +143,12 @@
@test isequal(PanelShift.tlead([1], [1]), x_shift)
@test isequal(x_shift, [missing])
- # --
+ # --
x_shift = tlag([1;2;3;4;5], [1;3;5;6;7], n=2)
@test isequal(PanelShift.tlag([1;3;5;6;7], [1;2;3;4;5], 2), x_shift)
@test isequal(x_shift, [missing; 1; 2; missing; 3])
- x_shift = tlag(float.([1;2;3;4;5]), [1;3;5;6;7], n=2)
+ x_shift = tlag(float.([1;2;3;4;5]), [1;3;5;6;7], n=2)
@test isequal(PanelShift.tlag(float.([1;3;5;6;7]), [1;2;3;4;5], 2), x_shift)
@test isequal(x_shift, [missing; 1; 2; missing; 3])
@@ -164,7 +164,7 @@
x_shift = tlag([:apple; :orange; :banana; :pineapple; :strawberry], [1;2;4;7;11], n=3)
@test isequal(PanelShift.tlag([1;2;4;7;11], [:apple; :orange; :banana; :pineapple; :strawberry], 3), x_shift)
@test isequal(x_shift, [missing; missing; :apple; :banana; missing])
-
+
x_shift = tlag([:apple; :orange; :banana; :pineapple; :strawberry], [1;2;4;7;11], n=4)
@test isequal(PanelShift.tlag([1;2;4;7;11], [:apple; :orange; :banana; :pineapple; :strawberry], 4), x_shift)
@@ -174,11 +174,11 @@
@test isequal(PanelShift.tlead([1;2;4;7;11], [:apple; :orange; :banana; :pineapple; :strawberry], 4), x_shift)
@test isequal(x_shift, [missing; missing; missing; :strawberry; missing])
- # indexed by dates
+ # indexed by dates
x_shift = tlag([1,2,3], [Date(2000,1,1), Date(2000, 1,2), Date(2000,1, 4)], n=Day(1))
@test isequal(PanelShift.tlag([Date(2000,1,1), Date(2000, 1,2), Date(2000,1, 4)], [1,2,3], Day(1)), x_shift)
@test isequal(x_shift, [missing; 1; missing])
-
+
x_shift = tlag([1,2,3], [Date(2000,1,1), Date(2000, 1,2), Date(2000,1, 4)], n=Day(2))
@test isequal(PanelShift.tlag([Date(2000,1,1), Date(2000, 1,2), Date(2000,1, 4)], [1,2,3], Day(2)), x_shift)
@test isequal(x_shift, [missing; missing; 2])
@@ -192,75 +192,44 @@
@test isequal(PanelShift.tshift([1;2;3], [1;2;3], 1), x_shift)
@test isequal(x_shift, tlag([1;2;3], [1;2;3], n=1))
- # safeguards
- # @test_throws ArgumentError PanelShift.tlag([1;2;2], [1,2,3]) # argcheck error unsorted t
- @test_throws r"time vector not sorted"i tlag([1, 2, 3], [1, 2, 2])
- # @test_throws ArgumentError PanelShift.tlag([1;2;], [1,2,3])
- @test_throws r"value and time vector"i tlag([1, 2], [1, 2, 3])
- # @test_throws ArgumentError PanelShift.tlag([1;2;3], [1,2,3], 0)
- @test_throws r"shift value"i tlag([1, 2, 3], [1, 2, 3], n=0)
+ # safeguards for tlag
+ @test_throws r"time vector not sorted"i tlag([1, 2, 3], [1, 2, 2])
+ @test_throws r"value and time vector"i tlag([1, 2], [1, 2, 3])
+ @test_throws r"shift value"i tlag([1, 2, 3], [1, 2, 3], n=0)
- end
+ end
# --------------------------------------------------------------------------------------------------
-
# --------------------------------------------------------------------------------------------------
-# benchmarking
-
-# using Chairmarks
-# large_df = generate_test_data(size=50_000_000, gap_probability=0.1);
-
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag_day)
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Month(1))) => :x_lag_mth)
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Year(1))) => :x_lag_yr)
+ @testset "tlead error paths" begin
+ # unsorted time vector
+ @test_throws r"time vector not sorted"i tlead([1, 2, 3], [3, 1, 2])
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlag(d, x)) => :x_lag_day)
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlag(d, x, Month(1))) => :x_lag_mth)
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlag(d, x, Year(1))) => :x_lag_yr)
-
-
-
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlead(x, d)) => :x_lag_day)
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlead(x, d, n=Month(1))) => :x_lag_mth)
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlead(x, d, n=Year(1))) => :x_lag_yr)
-
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlead(d, x)) => :x_lag_day)
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlead(d, x, Month(1))) => :x_lag_mth)
-# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlead(d, x, Year(1))) => :x_lag_yr)
-
-# --------------------------------------------------------------------------------------------------
+ # mismatched lengths
+ @test_throws r"value and time vector"i tlead([1, 2], [1, 2, 3])
+ # zero shift
+ @test_throws r"shift value"i tlead([1, 2, 3], [1, 2, 3], n=0)
+ end
+# --------------------------------------------------------------------------------------------------
+# --------------------------------------------------------------------------------------------------
+ @testset "tshift edge cases" begin
+ # tshift with n=nothing should warn and default to lag
+ result = @test_logs (:warn, r"shift not specified"i) tshift([1, 2, 3], [1, 2, 3])
+ @test isequal(result, tlag([1, 2, 3], [1, 2, 3]))
+
+ # tshift with Date vectors
+ dates = [Date(2020, 1, 1), Date(2020, 1, 2), Date(2020, 1, 3)]
+ result = tshift([10, 20, 30], dates, n=Day(1))
+ @test isequal(result, tlag([10, 20, 30], dates, n=Day(1)))
+
+ result = tshift([10, 20, 30], dates, n=Day(-1))
+ @test isequal(result, tlead([10, 20, 30], dates, n=Day(1)))
+ end
+# --------------------------------------------------------------------------------------------------
end
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/test/UnitTests/winsorize.jl b/test/UnitTests/winsorize.jl
@@ -1,13 +1,13 @@
@testset "winsorize" begin
- Random.seed!(3);
+ Random.seed!(3);
x1 = rand(100);
x2 = Vector{Union{Float64, Missing}}(rand(Float64, 100)); x2[rand(collect(1:100), 5)] .= missing;
# --- tests on non-missing vectors
x1_win = winsorize(x1, probs=(0.05, 0.95), verbose=true);
@test findall(x1 .!= x1_win) == [4, 15, 26, 32, 40, 44, 52, 59, 64, 97]
-
+
x1_win = winsorize(x1; verbose=true);
@test findall(x1 .!= x1_win) == []
@@ -21,7 +21,7 @@
x2_win = winsorize(x2, probs=(0.02, 0.98), verbose=true);
@test size(x2) == size(x2_win)
@test findall(skipmissing(x2 .!= x2_win)) == [5, 41, 83, 91]
-
+
x2_win = winsorize(x2; verbose=true)
@test size(x2) == size(x2_win)
@test findall(skipmissing(x2 .!= x2_win)) == []
@@ -43,12 +43,69 @@
@test size(x2) == size(x2_win)
@test findall(v -> v ∈ (-1.0, 1.0), skipmissing(x2_win)) == [5, 17, 41, 42, 65, 83, 91]
- # we check that this works if the type of replace is slightly different ...
+ # we check that this works if the type of replace is slightly different ...
# maybe we want to change this ...
x2_win = winsorize(x2; cutpoints=(0.05, 0.95), replace_value=(-1, 1), verbose=true)
@test size(x2) == size(x2_win)
@test findall(v -> v ∈ (-1.0, 1.0), skipmissing(x2_win)) == [5, 17, 41, 42, 65, 83, 91]
+end
+
+
+@testset "winsorize - custom IQR" begin
+ Random.seed!(42)
+ x = randn(1000) # standard normal: outliers likely beyond ~3σ
+
+ # default IQR=3 should keep most data
+ w_default = winsorize(x)
+ n_changed_default = count(x .!= w_default)
+
+ # IQR=1 should clip more aggressively
+ w_tight = winsorize(x, IQR=1)
+ n_changed_tight = count(x .!= w_tight)
+ @test n_changed_tight > n_changed_default
+
+ # IQR=100 should clip almost nothing
+ w_loose = winsorize(x, IQR=100)
+ @test count(x .!= w_loose) == 0
+end
+
+
+@testset "winsorize - edge cases" begin
+ # all identical values: nothing to winsorize
+ x_same = fill(5.0, 50)
+ w = winsorize(x_same, probs=(0.05, 0.95))
+ @test w == x_same
+
+ # single-element vector
+ x_one = [3.14]
+ w = winsorize(x_one, probs=(0.1, 0.9))
+ @test w == x_one
+
+ # integer vector
+ x_int = collect(1:100)
+ w = winsorize(x_int, probs=(0.05, 0.95))
+ @test length(w) == 100
+ @test minimum(w) >= minimum(x_int)
+ @test maximum(w) <= maximum(x_int)
+ @test count(w .!= x_int) > 0 # some values should be clipped
+
+ # one-sided winsorize: only clip top
+ Random.seed!(1)
+ x = rand(100)
+ w = winsorize(x, cutpoints=(minimum(x), 0.5))
+ @test minimum(w) == minimum(x) # bottom unchanged
+ @test maximum(w) <= 0.5
+
+ # one-sided: only clip bottom
+ w = winsorize(x, cutpoints=(0.5, maximum(x)))
+ @test minimum(w) >= 0.5
+ @test maximum(w) == maximum(x) # top unchanged
+end
+@testset "winsorize - all missing" begin
+ x_all_missing = Vector{Union{Float64, Missing}}(fill(missing, 10))
+    # probs path calls skipmissing, which yields an empty collection, so quantile should throw
+ @test_throws Exception winsorize(x_all_missing, probs=(0.05, 0.95))
end
diff --git a/test/UnitTests/xtile.jl b/test/UnitTests/xtile.jl
@@ -51,5 +51,51 @@
@test isequal(xtile(s_m, 3), [1, 1, 2, missing, 1, missing, 3])
@test isequal(xtile(s_m, 20), [1, 2, 4, missing, 2, missing, 5])
+end
-end-
\ No newline at end of file
+
+@testset "xtile - edge cases" begin
+
+ # all-missing input
+ x_all_missing = Vector{Union{Int64, Missing}}(fill(missing, 10))
+ result = xtile(x_all_missing, 4)
+ @test all(ismissing, result)
+ @test length(result) == 10
+
+ # single-element vector: searchsortedlast puts the value at the last quantile
+ result = xtile([42.0], 5)
+ @test length(result) == 1
+ @test result[1] isa Int
+
+ result = xtile([42], 5)
+ @test length(result) == 1
+
+ result = xtile(["hello"], 3)
+ @test length(result) == 1
+
+ # single-element with missing wrapper
+ x_single_m = Union{Int, Missing}[7]
+ result = xtile(x_single_m, 3)
+ @test length(result) == 1
+ @test !ismissing(result[1])
+
+ # two-element vector: results should be valid bin indices
+ result = xtile([1.0, 2.0], 2)
+ @test length(result) == 2
+ @test result[1] < result[2] # lower value gets lower bin
+
+ # all identical values: all should get the same bin
+ x_same = fill(5.0, 100)
+ result = xtile(x_same, 10)
+ @test allequal(result)
+ @test length(result) == 100
+
+    # n_quantiles == 1: searchsortedlast returns 1 at the maximum and 0 elsewhere
+ result = xtile(rand(50), 1)
+ @test all(r -> r in (0, 1), result)
+
+ # large n_quantiles: bins are bounded by n_quantiles
+ result = xtile(rand(100), 10)
+ @test all(r -> 0 <= r <= 10, result)
+
+end