FinanceRoutines.jl

Financial data routines for Julia
Log | Files | Refs | README | LICENSE

Merge_CRSP_Comp.jl (11462B)


      1 #! /usr/bin/env julia
      2 # --------------------------------------------------------------------------------------------------
      3 # Merge_CRSP_Comp.jl
      4 
      5 # Collection of functions that get the link files from crsp/compustat
      6 # --------------------------------------------------------------------------------------------------
      7 
      8 
      9 # --------------------------------------------------------------------------------------------------
     10 # List of exported functions
     11 # export link_Funda
     12 # export link_MSF
     13 # --------------------------------------------------------------------------------------------------
     14 
     15 
     16 
     17 # --------------------------------------------------------------------------------------------------
     18 """
     19     import_ccm_link(wrds_conn::Connection)
     20     import_ccm_link(; user::String="", password::String="")
     21 
     22 Import and process the CRSP/Compustat Merged (CCM) linking table from WRDS.
     23 
     24 Downloads the CCM linking table that maps between CRSP's PERMNO and Compustat's GVKEY 
     25 identifiers, enabling cross-database research between CRSP and Compustat datasets.
     26 
     27 # Arguments
     28 ## Method 1
     29 - `wrds_conn::Connection`: An established database connection to WRDS PostgreSQL server
     30 
     31 ## Method 2 (Keyword Arguments)
     32 - `user::String=""`: WRDS username. If empty, attempts to use default connection via `open_wrds_pg()`
     33 - `password::String=""`: WRDS password. Only used if `user` is provided
     34 
     35 # Returns
     36 - `DataFrame`: Processed linking table with the following columns:
     37   - `:gvkey`: Compustat's permanent company identifier (converted to Int)
     38   - `:permno`: CRSP's permanent security identifier (renamed from `:lpermno`)
     39   - `:linkdt`: Start date of the link validity period
     40   - `:linkenddt`: End date of the link validity period (missing values set to today's date)
     41   - `:linkprim`: Primary link marker (String3 type)
     42   - `:liid`: IID of the linked CRSP issue (String3 type)
     43   - `:linktype`: Type of link (String3 type)
     44   - Additional columns from the original CRSP.CCMXPF_LNKHIST table
     45 
     46 # Processing Steps
     47 1. Downloads the complete CRSP.CCMXPF_LNKHIST table from WRDS
     48 2. Converts integer columns to proper Int type (handling missing values)
     49 3. Parses GVKEY from string to integer format
     50 4. Converts link descriptors to String3 type for efficiency
     51 5. Filters to keep only primary links:
     52    - Link types: "LU" (US companies), "LC" (Canadian), "LS" (ADRs)
     53    - Link primary: "P" (Primary) or "C" (Primary after CUSIP link)
     54 6. Sets missing end dates to today's date (assuming link is still active)
     55 7. Renames `:lpermno` to `:permno` for consistency
     56 
     57 # Examples
     58 ```julia
     59 # Using existing connection
     60 wrds_conn = open_wrds_pg("myusername", "mypassword")
     61 df_linktable = import_ccm_link(wrds_conn)
     62 
     63 # Using automatic connection
     64 df_linktable = import_ccm_link()
     65 
     66 # Using credentials directly
     67 df_linktable = import_ccm_link(user="myusername", password="mypassword")
     68 ```
     69 
     70 # Notes
     71 - Requires active WRDS subscription and PostgreSQL access
     72 - Only primary security links are retained (see WRDS CCM documentation for link type details)
     73 - Missing link end dates are interpreted as currently active links
     74 - The function uses `@p` macro for pipeline operations and `@debug` for logging
     75 - All date columns (`:linkdt`, `:linkenddt`) and `:permno` are set as non-missing
     76 
     77 # References
     78 - WRDS CCM Database documentation: https://wrds-www.wharton.upenn.edu/pages/support/manuals-and-overviews/crsp/crspcompustat-merged-ccm/
     79 
     80 See also: [`link_Funda`](@ref), [`link_MSF`](@ref), [`open_wrds_pg`](@ref)
     81 """
     82 function import_ccm_link(wrds_conn::Connection)
     83 
     84     # Download link table
     85     postgre_query_linktable = """
     86         SELECT *
     87             FROM crsp.ccmxpf_lnkhist
     88     """
     89     res_q_linktable = execute(wrds_conn, postgre_query_linktable)
     90 
     91     df_linktable = DataFrame(columntable(res_q_linktable))
     92     transform!(df_linktable, names(df_linktable, check_integer.(eachcol(df_linktable))) .=>
     93         (x->convert.(Union{Missing, Int}, x));
     94         renamecols = false);
     95     transform!(df_linktable, :gvkey => ByRow(x->parse(Int, x)) => :gvkey);
     96     transform!(df_linktable, [:linkprim, :liid, :linktype] .=> ByRow(String3), renamecols=false)
     97 
     98     # Prepare the table
     99     @p df_linktable |> filter!(_.linktype ∈ ("LU", "LC", "LS") && _.linkprim ∈ ("P", "C") )
    100     df_linktable[ ismissing.(df_linktable.linkenddt), :linkenddt ] .= Dates.today();
    101     disallowmissing!(df_linktable, [:linkdt, :linkenddt, :lpermno]);
    102     @debug "renaming lpermno in linktable to permno"
    103     rename!(df_linktable, :lpermno => :permno);
    104 
    105     return df_linktable
    106 end
    107 
    108 
    109 # when there are no connections establisheds
    110 function import_ccm_link(;
    111     user::String = "", password::String = "")
    112 
    113     with_wrds_connection(user=user, password=password) do conn
    114         import_ccm_link(conn)
    115     end
    116 end
    117 # --------------------------------------------------------------------------------------------------
    118 
    119 
    120 # --------------------------------------------------------------------------------------------------
    121 """
    122     link_Funda(df_linktable::DataFrame, df_funda::DataFrame, variables::Vector{Symbol}=Symbol[])
    123 
    124 Link Compustat fundamentals data with CRSP security identifiers using a linking table.
    125 
    126 This function performs a temporal join between Compustat fundamental data and a security 
    127 linking table (typically CRSP/Compustat Merged Database linking table) to assign PERMNO 
    128 identifiers to Compustat records based on valid date ranges.
    129 
    130 # Arguments
    131 - `df_linktable::DataFrame`: Linking table containing the mapping between GVKEY and PERMNO 
    132   identifiers. Must include columns:
    133   - `:gvkey`: Compustat's permanent company identifier
    134   - `:linkdt`: Start date of the link validity period
    135   - `:linkenddt`: End date of the link validity period
    136   - `:permno`: CRSP's permanent security identifier
    137   - Additional columns that will be removed: `:linkprim`, `:liid`, `:linktype`
    138 
    139 - `df_funda::DataFrame`: Compustat fundamentals data. Must include columns:
    140   - `:gvkey`: Compustat's permanent company identifier
    141   - `:datadate`: Date of the fundamental data observation
    142 
    143 - `variables::Vector{Symbol}=Symbol[]`: which existing variables in the dataframe do we want to keep
    144 
    145 # Returns
    146 - `DataFrame`: Joined dataset containing all columns from `df_funda` plus `:permno` from 
    147   the linking table. The following columns from the linking table are excluded from output:
    148   `:gvkey_1`, `:linkprim`, `:liid`, `:linktype`, `:linkdt`, `:linkenddt`
    149 
    150 # Details
    151 The function performs an inner join where:
    152 1. Records are matched on `:gvkey`
    153 2. The `:datadate` from fundamentals must fall within the valid link period 
    154    `[linkdt, linkenddt]` from the linking table
    155 
    156 This ensures that each fundamental data observation is matched with the correct PERMNO 
    157 based on the security's identification history, handling cases where companies change 
    158 their CRSP identifiers over time.
    159 
    160 # Examples
    161 ```julia
    162 # Load linking table and fundamentals data
    163 df_linktable = load_ccm_links()
    164 df_funda = load_compustat_funda()
    165 
    166 # Link the datasets
    167 linked_data = link_Funda(df_linktable, df_funda)
    168 # Result contains fundamental data with PERMNO identifiers added
    169 ```
    170 
    171 # Notes
    172 Uses FlexiJoins.innerjoin for temporal joining capabilities
    173 Only records with valid links during the observation date are retained
    174 """
    175 function link_Funda(df_linktable::DataFrame, df_funda::DataFrame,
    176     variables::Vector{Symbol}=Symbol[])
    177 
    178     funda_link_permno = FlexiJoins.innerjoin(
    179         (select(df_funda, :gvkey, :datadate), df_linktable),
    180         by_key(:gvkey) & by_pred(:datadate, ∈, x->x.linkdt..x.linkenddt) )
    181     
    182     variables_to_remove = vcat(:gvkey_1,
    183         setdiff([:linkprim, :liid, :linktype, :linkdt, :linkenddt], variables) )
    184 
    185     select!(funda_link_permno,
    186         Not(variables_to_remove))
    187 
    188     return funda_link_permno
    189 
    190 end
    191 # ------------------------------------------------------------------------------------------
    192 
    193 
    194 
    195 # ------------------------------------------------------------------------------------------
    196 """
    197     link_MSF(df_linktable::DataFrame, df_msf::DataFrame; variables::Vector{Symbol}=Symbol[])
    198 
    199 Link CRSP monthly stock file data with Compustat identifiers using a linking table.
    200 
    201 This function performs a temporal join to add GVKEY (Compustat identifiers) to CRSP monthly 
    202 stock data, enabling cross-database analysis between CRSP and Compustat datasets.
    203 
    204 # Arguments
    205 - `df_linktable::DataFrame`: Linking table containing the mapping between PERMNO and GVKEY 
    206   identifiers. Must include columns:
    207   - `:permno`: CRSP's permanent security identifier
    208   - `:gvkey`: Compustat's permanent company identifier
    209   - `:linkdt`: Start date of the link validity period
    210   - `:linkenddt`: End date of the link validity period
    211 
    212 - `df_msf::DataFrame`: CRSP monthly stock file data. Must include columns:
    213   - `:permno`: CRSP's permanent security identifier
    214   - `:date`: Date of the stock observation
    215   - Additional columns as specified in `variables` (if any)
    216 
    217 # Keyword Arguments
    218 - `variables::Vector{Symbol}=Symbol[]`: Optional list of additional columns to retain from 
    219   the linking process. Only columns that exist in both datasets will be kept.
    220 
    221 # Returns
    222 - `DataFrame`: Original CRSP data with GVKEY identifiers added where valid links exist. 
    223   Includes:
    224   - All original columns from `df_msf`
    225   - `:gvkey`: Compustat identifier (where available)
    226   - `:datey`: Year extracted from the `:date` column
    227   - Any additional columns specified in `variables` that exist in the joined data
    228 
    229 # Details
    230 The function performs a two-step process:
    231 1. **Inner join with temporal filtering**: Matches CRSP records to the linking table where 
    232    the stock date falls within the valid link period `[linkdt, linkenddt]`
    233 2. **Left join back to original data**: Ensures all original CRSP records are retained, 
    234    with GVKEY values added only where valid links exist
    235 
    236 Records with missing GVKEY values after the initial join are filtered out before the 
    237 merge-back step, ensuring only valid links are propagated.
    238 
    239 # Examples
    240 ```julia
    241 # Load data
    242 df_linktable = load_ccm_links()
    243 df_msf = load_crsp_monthly()
    244 
    245 # Basic linking
    246 linked_msf = link_MSF(df_linktable, df_msf)
    247 
    248 # Include additional variables from the linking table
    249 linked_msf = link_MSF(df_linktable, df_msf, variables=[:linkprim, :linktype])
    250 ```
    251 """
    252 function link_MSF(df_linktable::DataFrame, df_msf::DataFrame;
    253     variables::Vector{Symbol}=Symbol[])
    254 
    255 # Merge with CRSP
    256     df_msf_linked = FlexiJoins.innerjoin(
    257         (df_msf, df_linktable),
    258         by_key(:permno) & by_pred(:date, ∈, x->x.linkdt..x.linkenddt)
    259     )
    260     @p df_msf_linked |> filter!(.!ismissing.(_.gvkey))
    261     col_keep = vcat([:date, :permno, :gvkey], intersect(variables, propertynames(df_msf_linked))) |> unique
    262     select!(df_msf_linked, col_keep)
    263     
    264 # merge this back
    265     df_msf_linked = leftjoin(df_msf, df_msf_linked, on = [:date, :permno], source="_merge")
    266     transform!(df_msf_linked, :date => ByRow(year) => :datey)
    267     select!(df_msf_linked, Not(:_merge))
    268 
    269 
    270     return df_msf_linked
    271 end
    272 # ------------------------------------------------------------------------------------------
    273 
    274 
    275 
    276 # ------------------------------------------------------------------------------------------
    277 # function link_ccm(df_linktable, df_msf, df_funda)
    278 
    279 # # ccm
    280 #     df_ccm = leftjoin(
    281 #         df_msf_merged, df_funda,
    282 #         on = [:gvkey, :datey], matchmissing = :notequal)
    283 
    284 #     if save
    285 #         CSV.write("./tmp/ccm.csv.gz", df_ccm, compress=true)
    286 #     end
    287 
    288 # end
    289 # ------------------------------------------------------------------------------------------