Merge_CRSP_Comp.jl (11462B)
1 #! /usr/bin/env julia 2 # -------------------------------------------------------------------------------------------------- 3 # Merge_CRSP_Comp.jl 4 5 # Collection of functions that get the link files from crsp/compustat 6 # -------------------------------------------------------------------------------------------------- 7 8 9 # -------------------------------------------------------------------------------------------------- 10 # List of exported functions 11 # export link_Funda 12 # export link_MSF 13 # -------------------------------------------------------------------------------------------------- 14 15 16 17 # -------------------------------------------------------------------------------------------------- 18 """ 19 import_ccm_link(wrds_conn::Connection) 20 import_ccm_link(; user::String="", password::String="") 21 22 Import and process the CRSP/Compustat Merged (CCM) linking table from WRDS. 23 24 Downloads the CCM linking table that maps between CRSP's PERMNO and Compustat's GVKEY 25 identifiers, enabling cross-database research between CRSP and Compustat datasets. 26 27 # Arguments 28 ## Method 1 29 - `wrds_conn::Connection`: An established database connection to WRDS PostgreSQL server 30 31 ## Method 2 (Keyword Arguments) 32 - `user::String=""`: WRDS username. If empty, attempts to use default connection via `open_wrds_pg()` 33 - `password::String=""`: WRDS password. Only used if `user` is provided 34 35 # Returns 36 - `DataFrame`: Processed linking table with the following columns: 37 - `:gvkey`: Compustat's permanent company identifier (converted to Int) 38 - `:permno`: CRSP's permanent security identifier (renamed from `:lpermno`) 39 - `:linkdt`: Start date of the link validity period 40 - `:linkenddt`: End date of the link validity period (missing values set to today's date) 41 - `:linkprim`: Primary link marker (String3 type) 42 - `:liid`: IID of the linked CRSP issue (String3 type) 43 - `:linktype`: Type of link (String3 type) 44 - Additional columns from the original CRSP.CCMXPF_LNKHIST table 45 46 # Processing Steps 47 1. Downloads the complete CRSP.CCMXPF_LNKHIST table from WRDS 48 2. Converts integer columns to proper Int type (handling missing values) 49 3. Parses GVKEY from string to integer format 50 4. Converts link descriptors to String3 type for efficiency 51 5. Filters to keep only primary links: 52 - Link types: "LU" (US companies), "LC" (Canadian), "LS" (ADRs) 53 - Link primary: "P" (Primary) or "C" (Primary after CUSIP link) 54 6. Sets missing end dates to today's date (assuming link is still active) 55 7. Renames `:lpermno` to `:permno` for consistency 56 57 # Examples 58 ```julia 59 # Using existing connection 60 wrds_conn = open_wrds_pg("myusername", "mypassword") 61 df_linktable = import_ccm_link(wrds_conn) 62 63 # Using automatic connection 64 df_linktable = import_ccm_link() 65 66 # Using credentials directly 67 df_linktable = import_ccm_link(user="myusername", password="mypassword") 68 ``` 69 70 # Notes 71 - Requires active WRDS subscription and PostgreSQL access 72 - Only primary security links are retained (see WRDS CCM documentation for link type details) 73 - Missing link end dates are interpreted as currently active links 74 - The function uses `@p` macro for pipeline operations and `@debug` for logging 75 - All date columns (`:linkdt`, `:linkenddt`) and `:permno` are set as non-missing 76 77 # References 78 - WRDS CCM Database documentation: https://wrds-www.wharton.upenn.edu/pages/support/manuals-and-overviews/crsp/crspcompustat-merged-ccm/ 79 80 See also: [`link_Funda`](@ref), [`link_MSF`](@ref), [`open_wrds_pg`](@ref) 81 """ 82 function import_ccm_link(wrds_conn::Connection) 83 84 # Download link table 85 postgre_query_linktable = """ 86 SELECT * 87 FROM crsp.ccmxpf_lnkhist 88 """ 89 res_q_linktable = execute(wrds_conn, postgre_query_linktable) 90 91 df_linktable = DataFrame(columntable(res_q_linktable)) 92 transform!(df_linktable, names(df_linktable, check_integer.(eachcol(df_linktable))) .=> 93 (x->convert.(Union{Missing, Int}, x)); 94 renamecols = false); 95 transform!(df_linktable, :gvkey => ByRow(x->parse(Int, x)) => :gvkey); 96 transform!(df_linktable, [:linkprim, :liid, :linktype] .=> ByRow(String3), renamecols=false) 97 98 # Prepare the table 99 @p df_linktable |> filter!(_.linktype ∈ ("LU", "LC", "LS") && _.linkprim ∈ ("P", "C") ) 100 df_linktable[ ismissing.(df_linktable.linkenddt), :linkenddt ] .= Dates.today(); 101 disallowmissing!(df_linktable, [:linkdt, :linkenddt, :lpermno]); 102 @debug "renaming lpermno in linktable to permno" 103 rename!(df_linktable, :lpermno => :permno); 104 105 return df_linktable 106 end 107 108 109 # when there are no connections establisheds 110 function import_ccm_link(; 111 user::String = "", password::String = "") 112 113 with_wrds_connection(user=user, password=password) do conn 114 import_ccm_link(conn) 115 end 116 end 117 # -------------------------------------------------------------------------------------------------- 118 119 120 # -------------------------------------------------------------------------------------------------- 121 """ 122 link_Funda(df_linktable::DataFrame, df_funda::DataFrame, variables::Vector{Symbol}=Symbol[]) 123 124 Link Compustat fundamentals data with CRSP security identifiers using a linking table. 125 126 This function performs a temporal join between Compustat fundamental data and a security 127 linking table (typically CRSP/Compustat Merged Database linking table) to assign PERMNO 128 identifiers to Compustat records based on valid date ranges. 129 130 # Arguments 131 - `df_linktable::DataFrame`: Linking table containing the mapping between GVKEY and PERMNO 132 identifiers. Must include columns: 133 - `:gvkey`: Compustat's permanent company identifier 134 - `:linkdt`: Start date of the link validity period 135 - `:linkenddt`: End date of the link validity period 136 - `:permno`: CRSP's permanent security identifier 137 - Additional columns that will be removed: `:linkprim`, `:liid`, `:linktype` 138 139 - `df_funda::DataFrame`: Compustat fundamentals data. Must include columns: 140 - `:gvkey`: Compustat's permanent company identifier 141 - `:datadate`: Date of the fundamental data observation 142 143 - `variables::Vector{Symbol}=Symbol[]`: which existing variables in the dataframe do we want to keep 144 145 # Returns 146 - `DataFrame`: Joined dataset containing all columns from `df_funda` plus `:permno` from 147 the linking table. The following columns from the linking table are excluded from output: 148 `:gvkey_1`, `:linkprim`, `:liid`, `:linktype`, `:linkdt`, `:linkenddt` 149 150 # Details 151 The function performs an inner join where: 152 1. Records are matched on `:gvkey` 153 2. The `:datadate` from fundamentals must fall within the valid link period 154 `[linkdt, linkenddt]` from the linking table 155 156 This ensures that each fundamental data observation is matched with the correct PERMNO 157 based on the security's identification history, handling cases where companies change 158 their CRSP identifiers over time. 159 160 # Examples 161 ```julia 162 # Load linking table and fundamentals data 163 df_linktable = load_ccm_links() 164 df_funda = load_compustat_funda() 165 166 # Link the datasets 167 linked_data = link_Funda(df_linktable, df_funda) 168 # Result contains fundamental data with PERMNO identifiers added 169 ``` 170 171 # Notes 172 Uses FlexiJoins.innerjoin for temporal joining capabilities 173 Only records with valid links during the observation date are retained 174 """ 175 function link_Funda(df_linktable::DataFrame, df_funda::DataFrame, 176 variables::Vector{Symbol}=Symbol[]) 177 178 funda_link_permno = FlexiJoins.innerjoin( 179 (select(df_funda, :gvkey, :datadate), df_linktable), 180 by_key(:gvkey) & by_pred(:datadate, ∈, x->x.linkdt..x.linkenddt) ) 181 182 variables_to_remove = vcat(:gvkey_1, 183 setdiff([:linkprim, :liid, :linktype, :linkdt, :linkenddt], variables) ) 184 185 select!(funda_link_permno, 186 Not(variables_to_remove)) 187 188 return funda_link_permno 189 190 end 191 # ------------------------------------------------------------------------------------------ 192 193 194 195 # ------------------------------------------------------------------------------------------ 196 """ 197 link_MSF(df_linktable::DataFrame, df_msf::DataFrame; variables::Vector{Symbol}=Symbol[]) 198 199 Link CRSP monthly stock file data with Compustat identifiers using a linking table. 200 201 This function performs a temporal join to add GVKEY (Compustat identifiers) to CRSP monthly 202 stock data, enabling cross-database analysis between CRSP and Compustat datasets. 203 204 # Arguments 205 - `df_linktable::DataFrame`: Linking table containing the mapping between PERMNO and GVKEY 206 identifiers. Must include columns: 207 - `:permno`: CRSP's permanent security identifier 208 - `:gvkey`: Compustat's permanent company identifier 209 - `:linkdt`: Start date of the link validity period 210 - `:linkenddt`: End date of the link validity period 211 212 - `df_msf::DataFrame`: CRSP monthly stock file data. Must include columns: 213 - `:permno`: CRSP's permanent security identifier 214 - `:date`: Date of the stock observation 215 - Additional columns as specified in `variables` (if any) 216 217 # Keyword Arguments 218 - `variables::Vector{Symbol}=Symbol[]`: Optional list of additional columns to retain from 219 the linking process. Only columns that exist in both datasets will be kept. 220 221 # Returns 222 - `DataFrame`: Original CRSP data with GVKEY identifiers added where valid links exist. 223 Includes: 224 - All original columns from `df_msf` 225 - `:gvkey`: Compustat identifier (where available) 226 - `:datey`: Year extracted from the `:date` column 227 - Any additional columns specified in `variables` that exist in the joined data 228 229 # Details 230 The function performs a two-step process: 231 1. **Inner join with temporal filtering**: Matches CRSP records to the linking table where 232 the stock date falls within the valid link period `[linkdt, linkenddt]` 233 2. **Left join back to original data**: Ensures all original CRSP records are retained, 234 with GVKEY values added only where valid links exist 235 236 Records with missing GVKEY values after the initial join are filtered out before the 237 merge-back step, ensuring only valid links are propagated. 238 239 # Examples 240 ```julia 241 # Load data 242 df_linktable = load_ccm_links() 243 df_msf = load_crsp_monthly() 244 245 # Basic linking 246 linked_msf = link_MSF(df_linktable, df_msf) 247 248 # Include additional variables from the linking table 249 linked_msf = link_MSF(df_linktable, df_msf, variables=[:linkprim, :linktype]) 250 ``` 251 """ 252 function link_MSF(df_linktable::DataFrame, df_msf::DataFrame; 253 variables::Vector{Symbol}=Symbol[]) 254 255 # Merge with CRSP 256 df_msf_linked = FlexiJoins.innerjoin( 257 (df_msf, df_linktable), 258 by_key(:permno) & by_pred(:date, ∈, x->x.linkdt..x.linkenddt) 259 ) 260 @p df_msf_linked |> filter!(.!ismissing.(_.gvkey)) 261 col_keep = vcat([:date, :permno, :gvkey], intersect(variables, propertynames(df_msf_linked))) |> unique 262 select!(df_msf_linked, col_keep) 263 264 # merge this back 265 df_msf_linked = leftjoin(df_msf, df_msf_linked, on = [:date, :permno], source="_merge") 266 transform!(df_msf_linked, :date => ByRow(year) => :datey) 267 select!(df_msf_linked, Not(:_merge)) 268 269 270 return df_msf_linked 271 end 272 # ------------------------------------------------------------------------------------------ 273 274 275 276 # ------------------------------------------------------------------------------------------ 277 # function link_ccm(df_linktable, df_msf, df_funda) 278 279 # # ccm 280 # df_ccm = leftjoin( 281 # df_msf_merged, df_funda, 282 # on = [:gvkey, :datey], matchmissing = :notequal) 283 284 # if save 285 # CSV.write("./tmp/ccm.csv.gz", df_ccm, compress=true) 286 # end 287 288 # end 289 # ------------------------------------------------------------------------------------------