JSONLines.jl (8033B)
# --------------------------------------------------------------------------------------------------
# JSONLines.jl

# Functions to naturally parse JSON Lines files
# --------------------------------------------------------------------------------------------------


# --------------------------------------------------------------------------------------------------
# Exported function
# JSONLines
# --------------------------------------------------------------------------------------------------


# --------------------------------------------------------------------------------------------------
"""
    read_jsonl(source::Union{AbstractString, IO}; dict_of_json::Bool=false) -> Vector

!!! warning "Deprecated"
    `read_jsonl` is deprecated. Use `JSON.parse(source; jsonlines=true)` from
    [JSON.jl](https://github.com/JuliaIO/JSON.jl) v1 instead.

Read a JSON Lines (.jsonl) file or stream and return all records as a vector.

Each line is parsed as a separate JSON value. Lines that are empty or contain only
whitespace are skipped. The element type of the result is the narrowest type that holds
every parsed record, so files mixing objects, arrays and scalars are supported.

# Arguments
- `source::Union{AbstractString, IO}`: Path to a JSONL file, or an IO stream.
- `dict_of_json::Bool=false`: If `true`, every record that is a `JSON.Object` is converted
  to a `Dict{Symbol,Any}`; records of any other type are left unchanged.

# Returns
- `Vector`: A vector of parsed JSON values (an empty `Vector{Any}` when there are no records).

# Throws
- `ArgumentError`: if `source` is a path that is not an existing regular file.
"""
function read_jsonl(io::IO; dict_of_json::Bool=false)
    Base.depwarn("`read_jsonl` is deprecated. Use `JSON.parse(io; jsonlines=true)` from JSON.jl v1 instead.", :read_jsonl)
    nonempty_lines = [line for line in eachline(io) if !isempty(strip(line))]
    isempty(nonempty_lines) && return []

    # `map` starts from the type of the first record and widens the element type only when
    # a later record does not fit, so homogeneous files keep a concrete eltype while
    # heterogeneous files no longer fail on assignment into a too-narrow vector.
    results = map(JSON.parse, nonempty_lines)
    if dict_of_json
        results = map(r -> r isa JSON.Object ? _dict_of_json(r) : r, results)
    end

    return results
end

function read_jsonl(filename::AbstractString; kwargs...)
    if !isfile(filename)
        throw(ArgumentError("File does not exist or is not a regular file: $filename"))
    end
    return open(filename, "r") do io
        read_jsonl(io; kwargs...)
    end
end
# --------------------------------------------------------------------------------------------------


# --------------------------------------------------------------------------------------------------
# Using lazy evaluation with generators
# For very large files, you can create a generator that yields records on demand:

# Parse every non-blank line of `io` and `put!` the record on `ch`, throwing an
# `ArgumentError` as soon as a record is not an instance of `T`.
function _put_jsonl_records!(ch::Channel, io::IO, T::Type)
    for line in eachline(io)
        isempty(strip(line)) && continue
        val = JSON.parse(line)
        if !isa(val, T)
            throw(ArgumentError("Parsed value of type $(typeof(val)) does not match expected type $T;\nTry specifying T=Any"))
        end
        put!(ch, val)
    end
    return nothing
end

"""
    stream_jsonl(source::Union{AbstractString, IO}; T::Type=JSON.Object{String, Any}) -> Channel

!!! warning "Deprecated"
    `stream_jsonl` is deprecated. Use `JSON.parse(source; jsonlines=true)` from
    [JSON.jl](https://github.com/JuliaIO/JSON.jl) v1 instead.

Create a lazy Channel iterator for reading JSON Lines files record by record.

Lines that are empty or contain only whitespace are skipped. Parsing happens inside the
channel's producer task, so a record whose type does not match `T` raises its
`ArgumentError` in that task rather than at the call to `stream_jsonl`.

# Arguments
- `source::Union{AbstractString, IO}`: Path to a JSONL file, or an IO stream.
- `T::Type=JSON.Object{String, Any}`: Expected type for each record. Use `T=Any` for mixed types.

# Returns
- `Channel{T}`: A channel yielding parsed JSON objects one at a time.

# Throws
- `ArgumentError`: if `source` is a path that is not an existing regular file.
"""
function stream_jsonl(io::IO; T::Type=JSON.Object{String, Any})
    Base.depwarn("`stream_jsonl` is deprecated. Use `JSON.parse(io; jsonlines=true)` from JSON.jl v1 instead.", :stream_jsonl)
    return Channel{T}() do ch
        _put_jsonl_records!(ch, io, T)
    end
end


function stream_jsonl(filename::AbstractString; T::Type=JSON.Object{String, Any})
    Base.depwarn("`stream_jsonl` is deprecated. Use `JSON.parse(filename; jsonlines=true)` from JSON.jl v1 instead.", :stream_jsonl)
    if !isfile(filename)
        throw(ArgumentError("File does not exist or is not a regular file: $filename"))
    end
    return Channel{T}() do ch
        # The file is opened inside the producer task and closed when that task finishes.
        open(filename, "r") do io
            _put_jsonl_records!(ch, io, T)
        end
    end
end
# --------------------------------------------------------------------------------------------------


# --------------------------------------------------------------------------------------------------
# Trait used by `write_jsonl` to decide how `data` is turned into records.
abstract type IterationStyle end
struct TableIteration <: IterationStyle end
struct DirectIteration <: IterationStyle end

# Return `TableIteration()` for Tables.jl tables that are neither vectors nor dictionaries,
# and `DirectIteration()` for everything else.
function iteration_style(x)
    # Only use table iteration for proper table types
    if (Tables.istable(x) && !isa(x, AbstractVector) && !isa(x, AbstractDict))
        return TableIteration()
    else
        return DirectIteration()
    end
end

# Shared writer behind both `write_jsonl` iteration styles: validate the target directory,
# open `filename` (gzip-wrapped when requested) and emit one JSON document per line for
# every element of `rows(data)`.
function _write_jsonl_records(rows, filename::AbstractString, data, compress::Bool)
    dir = dirname(filename)
    if !isempty(dir) && !isdir(dir)
        throw(ArgumentError("Directory does not exist: $dir"))
    end
    isgz = compress || endswith(filename, ".gz")
    io = open(filename, "w")
    try
        if isgz
            # Wrapped inside the `try` so the file handle is still closed if constructing
            # the compressor fails.
            io = CodecZlib.GzipCompressorStream(io)
        end
        for value in rows(data)
            JSON.json(io, value)
            write(io, '\n')
        end
    finally
        close(io)
    end
    return filename
end

"""
    write_jsonl(filename, data; compress=false)

!!! warning "Deprecated"
    `write_jsonl` is deprecated. Use `JSON.json(filename, data; jsonlines=true)` from
    [JSON.jl](https://github.com/JuliaIO/JSON.jl) v1 instead.

Write an iterable of JSON-serializable values to a JSON Lines file.

Tables.jl tables (other than vectors and dictionaries) are written one row per line via
`Tables.namedtupleiterator`; any other `data` is iterated directly.

# Arguments
- `filename`: Output file path (writes gzip-compressed if ends with `.gz` or `compress=true`)
- `data`: An iterable of JSON-serializable values
- `compress::Bool=false`: Force gzip compression

# Returns
The filename.

# Throws
- `ArgumentError`: if the directory part of `filename` does not exist.
"""
function write_jsonl(filename::AbstractString, data; kwargs...)
    Base.depwarn("`write_jsonl` is deprecated. Use `JSON.json(filename, data; jsonlines=true)` from JSON.jl v1 instead.", :write_jsonl)
    return write_jsonl(filename, data, iteration_style(data); kwargs...)
end

# Table input: each row is written as a named tuple.
function write_jsonl(filename::AbstractString, data, ::TableIteration; compress::Bool=false)
    return _write_jsonl_records(Tables.namedtupleiterator, filename, data, compress)
end

# Any other iterable: each element is written as-is.
function write_jsonl(filename::AbstractString, data, ::DirectIteration; compress::Bool=false)
    return _write_jsonl_records(identity, filename, data, compress)
end
# --------------------------------------------------------------------------------------------------


# --------------------------------------------------------------------------------------------------
"""
    _dict_of_json(obj::AbstractDict) -> Dict{Symbol, Any}

Recursively convert a parsed JSON dictionary into a `Dict` with `Symbol` keys.

All keys are converted with `Symbol(key)` and dictionaries that appear directly as values
are converted recursively. Non-dict values are left unchanged; in particular, dictionaries
nested inside arrays are not converted.
"""
function _dict_of_json(d::AbstractDict)
    result = Dict{Symbol, Any}()
    for (k, v) in d
        result[Symbol(k)] = v isa AbstractDict ? _dict_of_json(v) : v
    end
    return result
end

# Keep old name as deprecated alias
function _dict_of_json3(d)
    Base.depwarn("`_dict_of_json3` is deprecated. Use `_dict_of_json` instead.", :_dict_of_json3)
    return _dict_of_json(d)
end
# --------------------------------------------------------------------------------------------------