Diagnostics.jl (2881B)
# --------------------------------------------------------------------------------------------------
# Diagnostics.jl

# Data quality diagnostics for financial DataFrames
# --------------------------------------------------------------------------------------------------


# --------------------------------------------------------------------------------------------------
"""
    diagnose(df; id_col=:permno, date_col=:date, ret_col=:ret, price_col=:prc)

Summarize the data quality of a financial DataFrame in a single report.

# Arguments
- `df::AbstractDataFrame`: table to inspect

# Keywords
- `id_col::Symbol=:permno`: column identifying the entity
- `date_col::Symbol=:date`: column holding the observation date
- `ret_col::Union{Nothing,Symbol}=:ret`: return column; pass `nothing` to skip the check
- `price_col::Union{Nothing,Symbol}=:prc`: price column; pass `nothing` to skip the check

# Returns
- `Dict{Symbol, Any}` with entries:
  - `:nrow`, `:ncol` — number of rows and columns of `df`
  - `:missing_rates` — `Dict{Symbol, Float64}` mapping every column to its share of
    `missing` entries (`0.0` for each column when `df` has no rows)
  - `:duplicate_keys` — row count minus the number of distinct `(id_col, date_col)`
    pairs; this entry is only set when both columns are present in `df`
  - `:suspicious_values` — `Vector{String}` with one message per anomaly detected:
    non-missing returns above `1.0` or below `-1.0`, and non-missing prices below zero

A `ret_col` or `price_col` that is not a column of `df` is skipped without error.

# Examples
```julia
df = import_MSF(conn; date_range=(Date("2020-01-01"), Date("2022-12-31")))
report = diagnose(df)
report[:missing_rates]       # Dict(:permno => 0.0, :ret => 0.02, ...)
report[:duplicate_keys]      # 0
report[:suspicious_values]   # ["15 returns outside [-100%, +100%]"]
```
"""
function diagnose(df::AbstractDataFrame;
    id_col::Symbol=:permno, date_col::Symbol=:date,
    ret_col::Union{Nothing,Symbol}=:ret,
    price_col::Union{Nothing,Symbol}=:prc)

    # Both quantities are reused by several checks below, so look them up once.
    n_obs = nrow(df)
    available = propertynames(df)

    report = Dict{Symbol, Any}()
    report[:nrow] = n_obs
    report[:ncol] = ncol(df)

    # Share of missing entries per column; an empty table reports 0.0 everywhere
    # rather than dividing by zero.
    rates = Dict{Symbol, Float64}()
    for name in names(df)
        if n_obs > 0
            rates[Symbol(name)] = count(ismissing, df[!, name]) / n_obs
        else
            rates[Symbol(name)] = 0.0
        end
    end
    report[:missing_rates] = rates

    # Rows in excess of the distinct (id, date) combinations; only meaningful
    # when both key columns are actually part of the table.
    if id_col in available && date_col in available
        n_distinct = nrow(unique(df, [id_col, date_col]))
        report[:duplicate_keys] = n_obs - n_distinct
    end

    # Value-level anomalies; missing entries are dropped before comparing.
    flags = String[]
    if ret_col !== nothing && ret_col in available
        n_extreme = count(r -> r > 1.0 || r < -1.0, skipmissing(df[!, ret_col]))
        if n_extreme > 0
            push!(flags, "$n_extreme returns outside [-100%, +100%]")
        end
    end
    if price_col !== nothing && price_col in available
        n_neg = count(r -> r < 0, skipmissing(df[!, price_col]))
        if n_neg > 0
            push!(flags, "$n_neg negative prices (CRSP convention for bid/ask midpoint)")
        end
    end
    report[:suspicious_values] = flags

    return report
end
# --------------------------------------------------------------------------------------------------