Diagnostics.jl (1411B)
@testset "Data Quality Diagnostics" begin

    import Dates: Date

    # Fixture: a small two-security panel seeded with one instance of each
    # problem the assertions below look for.
    months = [1, 2, 2,    # permno 1: February appears twice (duplicate key)
              1, 3, 4]    # permno 2: February is skipped (gap)
    df = DataFrame(
        permno = repeat([1, 2]; inner = 3),
        date = Date.(2020, months, 1),
        ret = [0.05, missing, 0.03, -1.5, 0.02, 150.0],    # suspicious: -1.5, 150.0
        prc = [10.0, 20.0, 20.0, -5.0, 30.0, 40.0],        # one negative price
    )
    allowmissing!(df, :ret)

    report = diagnose(df)

    # True when any suspicious-value message in `report` contains `fragment`.
    # The lookup stays inside the call so a missing key surfaces within `@test`.
    mentions(fragment) = any(msg -> occursin(fragment, msg), report[:suspicious_values])

    # Shape of the input is echoed back.
    @test report[:nrow] == 6
    @test report[:ncol] == 4

    # Per-column missing fractions: one of six returns is missing, no ids are.
    @test haskey(report, :missing_rates)
    @test report[:missing_rates][:ret] ≈ 1/6
    @test report[:missing_rates][:permno] == 0.0

    # Exactly one repeated key: (permno = 1, date = 2020-02-01).
    @test haskey(report, :duplicate_keys)
    @test report[:duplicate_keys] == 1

    # Two messages expected: one for extreme returns, one for negative prices.
    @test haskey(report, :suspicious_values)
    @test length(report[:suspicious_values]) == 2
    @test mentions("returns outside")
    @test mentions("negative prices")

    # With both the return and price columns disabled, nothing is flagged.
    report2 = diagnose(df; ret_col = nothing, price_col = nothing)
    @test isempty(report2[:suspicious_values])

end