BazerUtils.jl

Assorted Julia utilities including custom logging
Log | Files | Refs | README | LICENSE

commit 745bc554fe070d08001219e7c24b052672fe7acd
parent 867be70d3424213b1250e942d91400be4017ad0a
Author: Erik Loualiche <[email protected]>
Date:   Sun, 15 Feb 2026 22:53:26 -0600

deprecate JSONL functions in favor of JSON.jl v1

- Add Base.depwarn to read_jsonl, stream_jsonl, write_jsonl pointing
  users to JSON.parse/JSON.json with jsonlines=true
- Remove @show T debug statement left in read_jsonl
- Fix write_jsonl crashing on files in current directory (dirname("") bug)
- Trim docstrings and add deprecation admonitions
- Update JSONL doc page with deprecation banner

Co-Authored-By: Claude Opus 4.6 <[email protected]>

Diffstat:
M docs/src/man/read_jsonl.md | 108++++++++++++++-----------------------------------------------------------------
M src/JSONLines.jl | 121++++++++++++++++++++-----------------------------------------------------------
2 files changed, 49 insertions(+), 180 deletions(-)

diff --git a/docs/src/man/read_jsonl.md b/docs/src/man/read_jsonl.md @@ -1,5 +1,13 @@ # Working with JSON Lines Files +!!! warning "Deprecated" + The JSONL functions in BazerUtils (`read_jsonl`, `stream_jsonl`, `write_jsonl`) are deprecated. + Use [JSON.jl](https://github.com/JuliaIO/JSON.jl) v1 instead, which has native support: + ```julia + using JSON + data = JSON.parse("data.jsonl"; jsonlines=true) # read + JSON.json("out.jsonl", data; jsonlines=true) # write + ``` --- @@ -7,21 +15,11 @@ > JSON Lines (JSONL) is a convenient format for storing structured data that may be processed one record at a time. Each line is a valid JSON value, separated by a newline character. This format is ideal for large datasets and streaming applications. -- **UTF-8 Encoding:** Files must be UTF-8 encoded. Do not include a byte order mark (BOM). -- **One JSON Value Per Line:** Each line is a valid JSON value (object, array, string, number, boolean, or null). Blank lines are ignored. -- **Line Separator:** Each line ends with `\n` (or `\r\n`). The last line may or may not end with a newline. - - For more details, see [jsonlines.org](https://jsonlines.org/). -This is a personal implementation and is not tested for any sort of standard. -It works fine for my usecase and I try to fix things as I encounter them, but ymmv. - --- -## Reading JSON Lines Files - -You can use the `read_jsonl` and `stream_jsonl` functions to read JSONL files or streams. +## Legacy API (deprecated) ### `read_jsonl` @@ -29,102 +27,34 @@ Reads the entire file or stream into memory and returns a vector of parsed JSON ```julia using BazerUtils -import JSON3 data = read_jsonl("data.jsonl") -# or from an IOBuffer -buf = data = read_jsonl(IOBuffer("{\"a\": 1}\n{\"a\": 2}\n")) data = read_jsonl(IOBuffer("{\"a\": 1}\n{\"a\": 2}\n"); dict_of_json=true) ``` - -- **Arguments:** `source::Union{AbstractString, IO}` -- **Returns:** `Vector` of parsed JSON values -- **Note:** Loads all data into memory. 
For large files, use `stream_jsonl`. - ---- - - ### `stream_jsonl` -Creates a lazy iterator (Channel) that yields one parsed JSON value at a time, without loading the entire file into memory. - -```julia -stream = stream_jsonl(IOBuffer("{\"a\": 1}\n{\"a\": 2}\n")) -data = collect(stream) -BazerUtils._dict_of_json3.(data) - -stream = stream_jsonl(IOBuffer("{\"a\": 1}\n{\"a\": 2}\n[1,2,3]")) -collect(stream) # error because types of vector elements are not all JSON3.Object{} -stream = stream_jsonl(IOBuffer("{\"a\": 1}\n{\"a\": 2}\n[1,2,3]"), T=Any) -collect(stream) # default to Vector{Any} - -stream = stream_jsonl(IOBuffer("[4,5,6]\n[1,2,3]"), T= JSON3.Array{}) -collect(stream) -stream = stream_jsonl(IOBuffer("4\n1"), T=Int) -collect(stream) -``` +Creates a lazy iterator (Channel) that yields one parsed JSON value at a time. -Allows iterators ```julia -first10 = collect(Iterators.take(stream_jsonl("data.jsonl"), 10)) # Collect the first 10 records -# see tests for other iterators ... +for record in stream_jsonl("data.jsonl") + println(record) +end +first10 = collect(Iterators.take(stream_jsonl("data.jsonl"), 10)) ``` +### `write_jsonl` -- **Arguments:** `source::Union{AbstractString, IO}` -- **Returns:** `Channel` (iterator) of parsed JSON values -- **Note:** Ideal for large files and streaming workflows. - ---- - -## Writing JSON Lines Files - -Use `write_jsonl` to write an iterable of JSON-serializable values to a JSONL file. +Write an iterable of JSON-serializable values to a JSONL file. 
```julia write_jsonl("out.jsonl", [Dict("a"=>1), Dict("b"=>2)]) write_jsonl("out.jsonl.gz", (Dict("i"=>i) for i in 1:100); compress=true) ``` -- **Arguments:** - - `filename::AbstractString` - - `data`: iterable of JSON-serializable values - - `compress::Bool=false`: write gzip-compressed if true or filename ends with `.gz` -- **Returns:** The filename - ---- - - -## Example: Roundtrip with IOBuffer - -Note that there is no stable roundtrip between read and write, because of the way `JSON3` processes record into dictionaries and even when we add the dict flag it is `Symbol => Any` - -```julia -data_string = [Dict("a"=>1), Dict("b"=>2)] -data_symbol = [Dict(:a=>1), Dict(:b=>2)] - -function roundtrip(data) - buf = IOBuffer() - for obj in data - JSON3.write(buf, obj) - write(buf, '\n') - end - seekstart(buf) - return read_jsonl(buf; dict_of_json=true) -end - -roundtrip(data_string) == data_string -roundtrip(data_symbol) == data_symbol -``` - --- ## See Also -- [`JSON3.jl`](https://github.com/quinnj/JSON3.jl): Fast, flexible JSON parsing and serialization for Julia. -- [`CodecZlib.jl`](https://github.com/JuliaIO/CodecZlib.jl): Gzip compression support. - ---- - -For more advanced usage, see the function docstrings or the test suite. - \ No newline at end of file +- [`JSON.jl`](https://github.com/JuliaIO/JSON.jl): The recommended replacement. Use `jsonlines=true` for JSONL support. +- [`CodecZlib.jl`](https://github.com/JuliaIO/CodecZlib.jl): Gzip compression support.+ \ No newline at end of file diff --git a/src/JSONLines.jl b/src/JSONLines.jl @@ -16,46 +16,23 @@ """ read_jsonl(source::Union{AbstractString, IO}; dict_of_json::Bool=false) -> Vector +!!! warning "Deprecated" + `read_jsonl` is deprecated. Use `JSON.parse(source; jsonlines=true)` from + [JSON.jl](https://github.com/JuliaIO/JSON.jl) v1 instead. + Read a JSON Lines (.jsonl) file or stream and return all records as a vector. 
-This function reads the entire file or IO stream into memory at once, parsing each line as a separate -JSON value. Empty lines are automatically skipped. +Each line is parsed as a separate JSON value. Empty lines are skipped. # Arguments -- `source::Union{AbstractString, IO}`: Path to the JSON Lines file to read, or an IO stream (e.g., IOBuffer, file handle). +- `source::Union{AbstractString, IO}`: Path to a JSONL file, or an IO stream. - `dict_of_json::Bool=false`: If `true` and the parsed type is `JSON3.Object`, convert each record to a `Dict{Symbol,Any}`. # Returns -- `Vector`: A vector containing all parsed JSON values from the file or stream. - -# Examples -```julia -# Read all records from a JSONL file -data = read_jsonl("data.jsonl") - -# Read from an IOBuffer -buf = IOBuffer("$(JSON3.write(Dict(:a=>1)))\n$(JSON3.write(Dict(:a=>2)))\n") -data = read_jsonl(buf) - -# Convert JSON3.Object records to Dict -data = read_jsonl("data.jsonl"; dict_of_json=true) - -# Access individual records -first_record = data[1] -println("First record ID: ", first_record.id) -``` - -# Notes -- This function loads all data into memory, so it may not be suitable for very large files. -- For large files, consider using `stream_jsonl()` for streaming processing. -- The function will throw an error if the JSON on any line is malformed. -- The path must refer to an existing regular file. -- If `dict_of_json=true`, all records must be of type `JSON3.Object`. - -# See Also -- [`stream_jsonl`](@ref): For memory-efficient streaming of large JSONL files. +- `Vector`: A vector of parsed JSON values. """ function read_jsonl(io::IO; dict_of_json::Bool=false) + Base.depwarn("`read_jsonl` is deprecated. 
Use `JSON.parse(io; jsonlines=true)` from JSON.jl v1 instead.", :read_jsonl) lines = collect(eachline(io)) nonempty_lines = filter(l -> !isempty(strip(l)), lines) isempty(nonempty_lines) && return [] @@ -68,7 +45,6 @@ function read_jsonl(io::IO; dict_of_json::Bool=false) for (i, line) in enumerate(nonempty_lines[2:end]) results[i+1] = JSON3.read(line) end - @show T if dict_of_json && T <: JSON3.Object{} results = [_dict_of_json3(r) for r in results] end @@ -93,59 +69,21 @@ end """ stream_jsonl(source::Union{AbstractString, IO}; T::Type=JSON3.Object{}) -> Channel -Create a lazy iterator (Channel) for reading JSON Lines files record by record. +!!! warning "Deprecated" + `stream_jsonl` is deprecated. Use `JSON.parse(source; jsonlines=true)` from + [JSON.jl](https://github.com/JuliaIO/JSON.jl) v1 instead. -This function returns a Channel that yields JSON objects one at a time without loading -the entire file into memory. This is memory-efficient for processing large JSONL files. -Each parsed record is checked to match the specified type `T` (default: `JSON3.Object{}`). -If a record does not match `T`, an error is thrown. +Create a lazy Channel iterator for reading JSON Lines files record by record. # Arguments -- `source::Union{AbstractString, IO}`: Path to the JSON Lines file to read, or an IO stream (e.g., IOBuffer, file handle). -- `T::Type=JSON3.Object{}`: The expected type for each parsed record. Use `T=Any` to allow mixed types. +- `source::Union{AbstractString, IO}`: Path to a JSONL file, or an IO stream. +- `T::Type=JSON3.Object{}`: Expected type for each record. Use `T=Any` for mixed types. # Returns -- `Channel{T}`: A channel that yields parsed JSON objects one at a time. 
- -# Examples -```julia -# Process records one at a time (memory efficient) -for record in stream_jsonl("large_file.jsonl") - println("Processing record: ", record.id) -end - -# Collect first N records -first_10 = collect(Iterators.take(stream_jsonl("data.jsonl"), 10)) - -# Filter and process -filtered_records = [r for r in stream_jsonl("data.jsonl") if r.score > 0.5] - -# Stream from an IOBuffer -buf = IOBuffer("$(JSON3.write(Dict(:a=>1)))\n$(JSON3.write(Dict(:a=>2)))\n") -for record in stream_jsonl(buf) - @show record -end - -# Allow mixed types -for record in stream_jsonl("data.jsonl"; T=Any) - @show record -end -``` - -# Notes -- This is a lazy iterator: records are only read and parsed when requested. -- Memory usage remains constant regardless of file size. -- Empty lines are automatically skipped. -- The Channel is automatically closed when the file or stream is fully read or an error occurs. -- If JSON parsing fails on any line, the Channel will close and propagate the error. -- For file paths, the file remains open for the lifetime of the channel. -- For IO streams, the user is responsible for keeping the IO open while consuming the channel. -- If a parsed record does not match `T`, an error is thrown. Use `T=Any` to allow mixed types. - -# See Also -- [`read_jsonl`](@ref): For loading entire JSONL files into memory at once. +- `Channel{T}`: A channel yielding parsed JSON objects one at a time. """ function stream_jsonl(io::IO; T::Type=JSON3.Object{}) + Base.depwarn("`stream_jsonl` is deprecated. Use `JSON.parse(io; jsonlines=true)` from JSON.jl v1 instead.", :stream_jsonl) lines = Iterators.filter(l -> !isempty(strip(l)), eachline(io)) return Channel{T}() do ch for line in lines @@ -160,6 +98,7 @@ end function stream_jsonl(filename::AbstractString; T::Type=JSON3.Object{}) + Base.depwarn("`stream_jsonl` is deprecated. 
Use `JSON.parse(filename; jsonlines=true)` from JSON.jl v1 instead.", :stream_jsonl) if !isfile(filename) throw(ArgumentError("File does not exist or is not a regular file: $filename")) end @@ -197,29 +136,30 @@ end function write_jsonl(filename::AbstractString, data; kwargs...) + Base.depwarn("`write_jsonl` is deprecated. Use `JSON.json(filename, data; jsonlines=true)` from JSON.jl v1 instead.", :write_jsonl) write_jsonl(filename, data, iteration_style(data); kwargs...) end """ write_jsonl(filename, data; compress=false) -Write an iterable of JSON-serializable values to a JSON Lines file. +!!! warning "Deprecated" + `write_jsonl` is deprecated. Use `JSON.json(filename, data; jsonlines=true)` from + [JSON.jl](https://github.com/JuliaIO/JSON.jl) v1 instead. -- `filename`: Output file path (if ends with `.gz` or `compress=true`, writes gzip-compressed) -- `data`: An iterable (e.g., Vector, generator) of values (Dict, Array, String, Number, Bool, nothing, etc.) +Write an iterable of JSON-serializable values to a JSON Lines file. -Returns the filename. +# Arguments +- `filename`: Output file path (writes gzip-compressed if ends with `.gz` or `compress=true`) +- `data`: An iterable of JSON-serializable values +- `compress::Bool=false`: Force gzip compression -# Example -```julia -write_jsonl("out.jsonl", [Dict("a"=>1), Dict("b"=>2)]) -write_jsonl("out.jsonl.gz", (Dict("i"=>i) for i in 1:10^6)) -``` +# Returns +The filename. 
""" function write_jsonl(filename::AbstractString, data, ::TableIteration; compress::Bool=false) - # @warn "Implementation for tables" dir = dirname(filename) - if !isdir(dir) + if !isempty(dir) && !isdir(dir) throw(ArgumentError("Directory does not exist: $dir")) end isgz = compress || endswith(filename, ".gz") @@ -237,9 +177,8 @@ function write_jsonl(filename::AbstractString, data, ::TableIteration; compress: end function write_jsonl(filename::AbstractString, data, ::DirectIteration; compress::Bool=false) - # @warn "Implementation for direct iteration" dir = dirname(filename) - if !isdir(dir) + if !isempty(dir) && !isdir(dir) throw(ArgumentError("Directory does not exist: $dir")) end isgz = compress || endswith(filename, ".gz")