BazerData.jl

Data manipulation utilities for Julia
Log | Files | Refs | README | LICENSE

commit 1203c402a3c50602f5ea707c84134bcf6f58bfc0
parent 6109d8abafa19af3cc4bce88c555bec2d7c2bb5f
Author: Erik Loualiche <[email protected]>
Date:   Tue, 20 May 2025 17:46:01 -0500

loosen project.toml restrictions

Diffstat:
MProject.toml | 7++++---
MREADME.md | 35+++++++++++++++++++++++++++++++++++
Msrc/BazerData.jl | 3+++
Asrc/TimeShift.jl | 147+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atest/UnitTests/timeshift.jl | 266+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtest/runtests.jl | 1+
6 files changed, 456 insertions(+), 3 deletions(-)

diff --git a/Project.toml b/Project.toml @@ -19,17 +19,18 @@ ColorSchemes = "3.29.0" Crayons = "4.1.1" DataFrames = "1.7.0" Dates = "1.11.0" -Interpolations = "0.16.1" +Interpolations = ">= 0.15" Missings = "1.2.0" PrettyTables = "2.4.0" Random = "1.11.0" -StatsBase = "0.34.5" +StatsBase = " >= 0.30" julia = ">= 1.10.9" [extras] PalmerPenguins = "8b842266-38fa-440a-9b57-31493939ab85" +PanelShift = "d68e4d5e-4a60-4df1-b225-9a1636c75ae0" StreamToString = "dc918f9c-79cc-42e6-85f1-d8b9b09632f4" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "PalmerPenguins", "StreamToString"] +test = ["Test", "PalmerPenguins", "StreamToString", "PanelShift"] diff --git a/README.md b/README.md @@ -13,6 +13,7 @@ So far the package provides a four functions 2. create category based on quantile ([`xtile`](#xtile)) 3. winsorize some data ([`winsorize`](#winsorize-data)) 4. fill unbalanced panel data ([`panel_fill`](#filling-an-unbalanced-panel)) + 5. lead and lag functions ([`tlead|tlag`](#leads-and-lags)) Note that as the package grow in different directions, dependencies might become overwhelming. The readme serves as documentation; there might be more examples inside of the test folder. @@ -104,7 +105,41 @@ panel_fill(df_panel, :id, :t, [:v1, :v2, :v3], gap=Month(1), method=:linear, uniquecheck=true, flag=true, merge=true) ``` +### Leads and lags +This is largely "borrowed" (copied) from @FuZhiyu [`PanelShift.jl`](https://github.com/FuZhiyu/PanelShift.jl) package. 
+```julia +t, v = [1;2;4], [1;2;3]; # values are passed first, then the time vector +julia> tlag(v, t) # the default lag period is the unitary difference in t, here 1 +3-element Vector{Union{Missing, Int64}}: + missing + 1 + missing + + +julia> tlag(v, t, n=2) # we can also specify lags using the keyword argument n +3-element Vector{Union{Missing, Int64}}: + missing + missing + 2 + + +julia> using Dates; +julia> t = [Date(2020,1,1); Date(2020,1,2); Date(2020,1,4)]; +julia> tlag([1, 2, 3], t) # customized types of the time vector are also supported +3-element Vector{Union{Missing, Int64}}: + missing + 1 + missing + + +julia> tlag([1, 2, 3], t, n=Day(2)) # specify two-day lags +3-element Vector{Union{Missing, Int64}}: + missing + missing + 2 + +``` ## Other stuff diff --git a/src/BazerData.jl b/src/BazerData.jl @@ -3,6 +3,7 @@ module BazerData # -------------------------------------------------------------------------------------------------- import ColorSchemes: get, colorschemes import Crayons: @crayon_str +import Dates: Date import DataFrames: AbstractDataFrame, ByRow, DataFrame, groupby, combine, nrow, Not, nonunique, proprow, rename, rename!, select, select!, transform, transform!, unstack import Dates: format, now, DatePeriod, Dates, Dates.AbstractTime, ISODateTimeFormat @@ -17,6 +18,7 @@ import StatsBase: quantile, UnitWeights, Weights # -------------------------------------------------------------------------------------------------- # Import functions include("PanelData.jl") +include("TimeShift.jl") include("StataUtils.jl") include("Winsorize.jl") # -------------------------------------------------------------------------------------------------- @@ -25,6 +27,7 @@ include("Winsorize.jl") # -------------------------------------------------------------------------------------------------- # List of exported functions export panel_fill, panel_fill! 
+export tlead, tlag, tshift export tabulate export xtile export winsorize diff --git a/src/TimeShift.jl b/src/TimeShift.jl @@ -0,0 +1,147 @@ +# -------------------------------------------------------------------------------------------------- +# most of this code was copied from @FuZhiyu PanelShift.jl package +# -------------------------------------------------------------------------------------------------- + + +# -------------------------------------------------------------------------------------------------- +function tlag(x, t_vec; + n = nothing, + checksorted = true, + verbose = false, + ) + + if isnothing(n) # this is the default + n = oneunit(t_vec[1] - t_vec[1]) + verbose && ( (t_vec[1] isa Date) ? (@info "Default date gap inferred ... $n") : + (@info "Default gap inferred ... $n") ) + elseif eltype(t_vec) == Date + verbose && @info "No checks on increment argument n for type Date ... " + else + !(n isa typeof(t_vec[1]-t_vec[1])) && + error("Time gap type does not match time variable: typeof(n)=$(typeof(n)) != eltype(vec)=$(eltype(t_vec))") + + end + + checksorted && !issorted(t_vec; lt = (<=) ) && error("time vector not sorted (order is strict)!") + !(n > zero(n)) && error("shift value has to be positive!") + + N = length(t_vec) + (length(x) != N) && error("value and time vector have different lengths!") + + x_shift = Array{Union{Missing, eltype(x)}}(missing, N); + + # _binary_search_lag!(x_shift, x, t_vec, n, N) + _linear_scan!(x_shift, x, t_vec, n, N) + + return x_shift + +end + +function _linear_scan!(x_shift, x, t_vec, n, N) + j = 0 + @inbounds for i in 1:N + # Calculate the target time we're looking for + lagt = t_vec[i] - n + # Scan forward from where we left off to find the largest index + # where t_vec[j] <= lagt (since t_vec is sorted) + while j < N && t_vec[j + 1] <= lagt + j += 1 + end + + # If we found a valid index and it's an exact match + if j > 0 && t_vec[j] == lagt + x_shift[i] = x[j] + # else + # x_shift[i] = missing + end + end + 
return x_shift +end +# -------------------------------------------------------------------------------------------------- + + + +# -------------------------------------------------------------------------------------------------- +# most of this code was inspired by @FuZhiyu PanelShift.jl package +function tlead(x, t_vec; + n = nothing, + checksorted = true, + verbose = false, + ) + + if isnothing(n) # this is the default + n = oneunit(t_vec[1] - t_vec[1]) + verbose && ( (t_vec[1] isa Date) ? (@info "Default date gap inferred ... $n") : + (@info "Default gap inferred ... $n") ) + elseif eltype(t_vec) == Date + verbose && @info "No checks on increment argument n for date type ... " + else + !(n isa typeof(t_vec[1]-t_vec[1])) && + error("Time gap type does not match time variable: typeof(n)=$(typeof(n)) != eltype(vec)=$(eltype(t_vec))") + end + + checksorted && !issorted(t_vec; lt = (<=) ) && error("time vector not sorted (order is strict)!") + !(n > zero(n)) && error("shift value has to be positive!") + + N = length(t_vec) + (length(x) != N) && error("value and time vector have different lengths!") + + x_shift = Array{Union{Missing, eltype(x)}}(missing, N); + _linear_scan_lead!(x_shift, x, t_vec, n, N) + return x_shift + +end + +function _linear_scan_lead!(x_shift, x, t_vec, n, N) + j = 0 + + @inbounds for i in 1:N + leadt = t_vec[i] + n + # Early termination if already past the end of the array + if leadt > t_vec[N] + # All remaining targets will be beyond the array bounds + break + end + + # Fast forward scan (can add loop unrolling here if needed) + while j < N && t_vec[j + 1] < leadt + j += 1 + end + # Check for exact match at the next position + if j + 1 <= N && t_vec[j + 1] == leadt + x_shift[i] = x[j + 1] + end + end + return x_shift + +end +# -------------------------------------------------------------------------------------------------- + + +# -------------------------------------------------------------------------------------------------- +function 
tshift(x, t_vec; n=nothing, kwargs...) + + if isnothing(n) + @warn "shift not specified ... defaulting to lag" + n = oneunit(t_vec[1] - t_vec[1]) + end + + if n > zero(n) + return tlag(x, t_vec, n=n; kwargs...) + else + return tlead(x, t_vec, n=-n; kwargs...) + end +end +# -------------------------------------------------------------------------------------------------- + + + + + + + + + + + + diff --git a/test/UnitTests/timeshift.jl b/test/UnitTests/timeshift.jl @@ -0,0 +1,266 @@ +@testset "Time Shift" begin + + +# -------------------------------------------------------------------------------------------------- + df1 = DataFrame( # missing t=2 for id=1 + id = ["a","a","b","b","c","c","c"], + t = [1,3,8,9,1,2,4], + v1 = [1,1,1,6,6,0,0], + v2 = [1,2,3,6,6,4,5], + v3 = [1,5,4,6,6,15,12.25]) + + df2 = DataFrame( # missing t=2 for id=1 + id = ["a","a", "b","b", "c","c","c", "d","d","d","d"], + t = [Date(1990, 1, 1), Date(1990, 4, 1), Date(1990, 8, 1), Date(1990, 9, 1), + Date(1990, 1, 1), Date(1990, 2, 1), Date(1990, 4, 1), + Date(1999, 11, 10), Date(1999, 12, 21), Date(2000, 2, 5), Date(2000, 4, 1)], + v1 = [1,1, 1,6, 6,0,0, 1,4,11,13], + v2 = [1,2,3,6,6,4,5, 1,2,3,4], + v3 = [1,5,4,6,6,15,12.25, 21,22.5,17.2,1]) + + # --- test for df1 + @testset "DF1" begin + sort!(df1, [:id, :t]) + transform!(groupby(df1, :id), [:t, :v2] => ( (d, x) -> tlag(x, d)) => :v2_lag) + @test isequal(df1.v2_lag, [missing, missing, missing, 3, missing, 6, missing]) + end + + # --- test for df2 multiple variables + @testset "DF2" begin + sort!(df2, [:id, :t]) + transform!( + groupby(df2, :id), + [:t, :v1] => + ((t, v1) -> (; v1_lag_day = tlag(v1, t; verbose=true), + v1_lag_mth = tlag(v1, t; n=Month(1), verbose=true) ) ) => + [:v1_lag_day, :v1_lag_mth]) + + @test all(ismissing.(df2.v1_lag_day)) + @test isequal(df2.v1_lag_mth, + [missing, missing, missing, 1, missing, 6, missing, missing, missing, missing, missing ]) + + end +# 
-------------------------------------------------------------------------------------------------- + + +# -------------------------------------------------------------------------------------------------- + @testset "General tests" begin + + # --- test large datasets + function generate_test_data(;size=50_000, gap_probability=0.1, seed=123) + Random.seed!(seed) + + # Start date and initialize arrays + start_date = Date(2020, 1, 1) + dates = Vector{Date}() + x_values = Vector{Float64}() + + # Generate dates with some gaps and corresponding x values + current_date = start_date + for i in 1:size + # Add current date and value + push!(dates, current_date) + push!(x_values, sin(i/100) + 0.1*randn()) # Some noisy sine wave pattern + + # Decide whether to introduce a gap (skip 1-5 days) + if rand() < gap_probability + gap_size = rand(1:5) + current_date += Day(gap_size + 1) + else + # Normal increment + current_date += Day(1) + end + end + + # Create DataFrame + df = DataFrame(date=dates, x=x_values) + return df + end + + tiny_df = generate_test_data(size=50, gap_probability=0.05); + small_df = generate_test_data(size=5_000, gap_probability=0.1); + large_df = generate_test_data(size=1_000_000, gap_probability=0.1); + + @time transform!(small_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag) + @test nrow(subset(small_df, :x_lag => ByRow(!ismissing))) == 4525 + + @time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag_day); + @time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Month(1))) => :x_lag_mth); + @time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Year(1))) => :x_lag_yr); + + transform!(large_df, :date => ByRow(year) => :datey) + @test_throws r"time vector not sorted"i transform!(large_df, + [:x, :datey] => ( (x, d) -> tlag(x, d, n=1)) => :x_lag_datey); + + @test nrow(subset(large_df, :x_lag_day => ByRow(!ismissing))) == 900_182 + @test nrow(subset(large_df, :x_lag_mth => ByRow(!ismissing))) == 770_178 + @test 
nrow(subset(large_df, :x_lag_yr => ByRow(!ismissing))) == 769_502 + + @time transform!(tiny_df, [:x, :date] => ( (x, d) -> tlead(x, d)) => :x_lead) + @time transform!(tiny_df, [:x_lead, :date] => ( (x, d) -> tlag(x, d)) => :x_lead_lag) + @test dropmissing(tiny_df) |> (df -> df.x == df.x_lead_lag) # lead lag reverts back up to destroyed information + + @time transform!(tiny_df, [:x, :date] => ( (x, d) -> tlead(x, d, n=Day(2)) ) => :x_lead2) + @time transform!(tiny_df, [:x_lead2, :date] => ( (x, d) -> tlag(tlag(x, d), d) ) => :x_lead2_lag2) + @test dropmissing(tiny_df) |> (df -> df.x == df.x_lead2_lag2) # lead lag reverts back up to destroyed information + + + end # of "General tests" +# -------------------------------------------------------------------------------------------------- + + +# -------------------------------------------------------------------------------------------------- + @testset "From Panelshift.jl" begin + + import PanelShift + + # note the api for this package differs slightly ... 
+ # PanelShift.tlag(time_variable, x) + # BazerData.tlag(x, time_variable) + + x_shift = tlag([4, 5, 6], [1, 2, 3]) + @test isequal(PanelShift.tlag([1;2;3], [4;5;6], 1), x_shift) + @test isequal(x_shift, [missing, 4, 5]) + + x_shift = tlead([4, 5, 6], [1, 2, 3]) + @test isequal(PanelShift.tlead([1;2;3], [4;5;6], 1), x_shift) + @test isequal(x_shift, [5; 6; missing]) + + x_shift = tlag([4;5;6], [1;2;3], n=2) + @test isequal(PanelShift.tlag([1;2;3], [4;5;6], 2), x_shift) + @test isequal(x_shift, [missing;missing;4]) + + x_shift = tlead([4;5;6], [1;2;3], n=2) + @test isequal(PanelShift.tlead([1;2;3], [4;5;6], 2), x_shift) + @test isequal(x_shift, [6; missing; missing]) + + # unit-length vector + x_shift = tlag([1], [1]) + @test isequal(PanelShift.tlag([1], [1]), x_shift) #[missing;]) + @test isequal(x_shift, [missing]) + + x_shift = tlead([1], [1]) + @test isequal(PanelShift.tlead([1], [1]), x_shift) + @test isequal(x_shift, [missing]) + + # -- + x_shift = tlag([1;2;3;4;5], [1;3;5;6;7], n=2) + @test isequal(PanelShift.tlag([1;3;5;6;7], [1;2;3;4;5], 2), x_shift) + @test isequal(x_shift, [missing; 1; 2; missing; 3]) + + x_shift = tlag(float.([1;2;3;4;5]), [1;3;5;6;7], n=2) + @test isequal(PanelShift.tlag(float.([1;3;5;6;7]), [1;2;3;4;5], 2), x_shift) + @test isequal(x_shift, [missing; 1; 2; missing; 3]) + + # non-numeric x and unequal gaps + x_shift = tlag([:apple; :orange; :banana; :pineapple; :strawberry], [1;2;4;7;11], n=1) + @test isequal(PanelShift.tlag([1;2;4;7;11], [:apple; :orange; :banana; :pineapple; :strawberry], 1), x_shift) + @test isequal(x_shift, [missing; :apple; missing; missing; missing]) + + x_shift = tlag([:apple; :orange; :banana; :pineapple; :strawberry], [1;2;4;7;11], n=2) + @test isequal(PanelShift.tlag([1;2;4;7;11], [:apple; :orange; :banana; :pineapple; :strawberry], 2), x_shift) + @test isequal(x_shift, [missing; missing; :orange; missing; missing]) + + x_shift = tlag([:apple; :orange; :banana; :pineapple; :strawberry], [1;2;4;7;11], n=3) + 
@test isequal(PanelShift.tlag([1;2;4;7;11], [:apple; :orange; :banana; :pineapple; :strawberry], 3), x_shift) + @test isequal(x_shift, [missing; missing; :apple; :banana; missing]) + + + x_shift = tlag([:apple; :orange; :banana; :pineapple; :strawberry], [1;2;4;7;11], n=4) + @test isequal(PanelShift.tlag([1;2;4;7;11], [:apple; :orange; :banana; :pineapple; :strawberry], 4), x_shift) + @test isequal(x_shift, [missing; missing; missing; missing; :pineapple]) + + x_shift = tlead([:apple; :orange; :banana; :pineapple; :strawberry], [1;2;4;7;11], n=4) + @test isequal(PanelShift.tlead([1;2;4;7;11], [:apple; :orange; :banana; :pineapple; :strawberry], 4), x_shift) + @test isequal(x_shift, [missing; missing; missing; :strawberry; missing]) + + # indexed by dates + x_shift = tlag([1,2,3], [Date(2000,1,1), Date(2000, 1,2), Date(2000,1, 4)], n=Day(1)) + @test isequal(PanelShift.tlag([Date(2000,1,1), Date(2000, 1,2), Date(2000,1, 4)], [1,2,3], Day(1)), x_shift) + @test isequal(x_shift, [missing; 1; missing]) + + x_shift = tlag([1,2,3], [Date(2000,1,1), Date(2000, 1,2), Date(2000,1, 4)], n=Day(2)) + @test isequal(PanelShift.tlag([Date(2000,1,1), Date(2000, 1,2), Date(2000,1, 4)], [1,2,3], Day(2)), x_shift) + @test isequal(x_shift, [missing; missing; 2]) + + # test shift + x_shift = tshift([1;2;3], [1;2;3], n=-1) + @test isequal(PanelShift.tshift([1;2;3], [1;2;3], -1), x_shift) + @test isequal(x_shift, tlead([1;2;3], [1;2;3], n=1)) + + x_shift = tshift([1;2;3], [1;2;3], n=1) + @test isequal(PanelShift.tshift([1;2;3], [1;2;3], 1), x_shift) + @test isequal(x_shift, tlag([1;2;3], [1;2;3], n=1)) + + # safeguards + # @test_throws ArgumentError PanelShift.tlag([1;2;2], [1,2,3]) # argcheck error unsorted t + @test_throws r"time vector not sorted"i tlag([1, 2, 3], [1, 2, 2]) + # @test_throws ArgumentError PanelShift.tlag([1;2;], [1,2,3]) + @test_throws r"value and time vector"i tlag([1, 2], [1, 2, 3]) + # @test_throws ArgumentError PanelShift.tlag([1;2;3], [1,2,3], 0) + @test_throws 
r"shift value"i tlag([1, 2, 3], [1, 2, 3], n=0) + + end +# -------------------------------------------------------------------------------------------------- + + + +# -------------------------------------------------------------------------------------------------- +# benchmarking + +# using Chairmarks +# large_df = generate_test_data(size=50_000_000, gap_probability=0.1); + +# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag_day) +# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Month(1))) => :x_lag_mth) +# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Year(1))) => :x_lag_yr) + +# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlag(d, x)) => :x_lag_day) +# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlag(d, x, Month(1))) => :x_lag_mth) +# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlag(d, x, Year(1))) => :x_lag_yr) + + + +# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlead(x, d)) => :x_lag_day) +# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlead(x, d, n=Month(1))) => :x_lag_mth) +# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlead(x, d, n=Year(1))) => :x_lag_yr) + +# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlead(d, x)) => :x_lag_day) +# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlead(d, x, Month(1))) => :x_lag_mth) +# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlead(d, x, Year(1))) => :x_lag_yr) + +# -------------------------------------------------------------------------------------------------- + + + + + +end + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/runtests.jl b/test/runtests.jl @@ -11,6 +11,7 @@ using StreamToString const testsuite = [ "tabulate", "xtile", "winsorize", "panel_fill", + "timeshift" ] ENV["DATADEPS_ALWAYS_ACCEPT"] = true # for data loading of PalmerPenguins