commit 4adb77fa5e631a146a12a939aca03b010737507a
parent 250d7d372328d10b7f0fe5de9c1a047e88c10d91
Author: Erik Loualiche <[email protected]>
Date: Thu, 19 Jun 2025 19:36:52 -0500
fix some bugs in dataframe interface to jsonlines ... more general with tables
Diffstat:
4 files changed, 52 insertions(+), 5 deletions(-)
diff --git a/Project.toml b/Project.toml
@@ -9,6 +9,7 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36"
+Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
[compat]
CodecZlib = "0.7.8"
@@ -16,6 +17,7 @@ Dates = "1.11.0"
JSON3 = "1.14.3"
Logging = "1.11.0"
LoggingExtras = "1.1.0"
+Tables = "1.12.1"
julia = "1.6.7"
[extras]
diff --git a/src/BazerUtils.jl b/src/BazerUtils.jl
@@ -7,6 +7,7 @@ import Logging: global_logger, Logging, Logging.Debug, Logging.Info, Logging.War
import LoggingExtras: ConsoleLogger, EarlyFilteredLogger, FileLogger, FormatLogger,
MinLevelLogger, TeeLogger, TransformerLogger
import JSON3: JSON3
+import Tables: Tables
import CodecZlib: CodecZlib
# --------------------------------------------------------------------------------------------------
diff --git a/src/JSONLines.jl b/src/JSONLines.jl
@@ -182,6 +182,24 @@ end
# --------------------------------------------------------------------------------------------------
+abstract type IterationStyle end  # trait selecting how write_jsonl walks `data`
+struct TableIteration <: IterationStyle end  # rows via Tables.namedtupleiterator(data)
+struct DirectIteration <: IterationStyle end  # elements via plain `for value in data`
+
+function iteration_style(x)
+    # Vectors and dicts are written element by element even when Tables.jl accepts
+    # them as tables; every other Tables.jl source gets TableIteration().
+    Tables.istable(x) || return DirectIteration()
+    x isa AbstractVector && return DirectIteration()
+    x isa AbstractDict && return DirectIteration()
+    return TableIteration()
+end
+
+
+function write_jsonl(filename::AbstractString, data; kwargs...)  # two-argument entry
+    return write_jsonl(filename, data, iteration_style(data); kwargs...)
+end
+
"""
write_jsonl(filename, data; compress=false)
@@ -198,7 +216,8 @@ write_jsonl("out.jsonl", [Dict("a"=>1), Dict("b"=>2)])
write_jsonl("out.jsonl.gz", (Dict("i"=>i) for i in 1:10^6))
```
"""
-function write_jsonl(filename::AbstractString, data; compress::Bool=false)
+function write_jsonl(filename::AbstractString, data, ::TableIteration; compress::Bool=false)
+ # @warn "Implementation for tables"
dir = dirname(filename)
if !isdir(dir)
throw(ArgumentError("Directory does not exist: $dir"))
@@ -207,7 +226,7 @@ function write_jsonl(filename::AbstractString, data; compress::Bool=false)
openf = isgz ? x->CodecZlib.GzipCompressorStream(open(x, "w")) : x->open(x, "w")
io = openf(filename)
try
- for value in data
+ for value in Tables.namedtupleiterator(data)
JSON3.write(io, value)
write(io, '\n')
end
@@ -217,9 +236,24 @@ function write_jsonl(filename::AbstractString, data; compress::Bool=false)
return filename
end
-function write_jsonl(filename::AbstractString, data::AbstractDataFrame; kwargs...)
- row_tuples = (NamedTuple(row) for row in eachrow(data))
- write_jsonl(filename, row_tuples; kwargs...)
+function write_jsonl(filename::AbstractString, data, ::DirectIteration; compress::Bool=false)
+    # Write each element of `data` as one JSON line; gzip if `compress` or a ".gz" name.
+    dir = dirname(filename)  # "" for a bare file name such as "out.jsonl"
+    if !isempty(dir) && !isdir(dir)  # bare names target the cwd, so only check real dirs
+        throw(ArgumentError("Directory does not exist: $dir"))
+    end
+    isgz = compress || endswith(filename, ".gz")
+    openf = isgz ? x->CodecZlib.GzipCompressorStream(open(x, "w")) : x->open(x, "w")
+    io = openf(filename)
+    try
+        for value in data
+            JSON3.write(io, value)
+            write(io, '\n')  # one JSON document per line
+        end
+    finally
+        close(io)  # runs even when serialization throws
+    end
+    return filename  # the path that was written
+end
# --------------------------------------------------------------------------------------------------
diff --git a/test/UnitTests/jsonlines.jl b/test/UnitTests/jsonlines.jl
@@ -163,6 +163,16 @@ end
@test BazerUtils._dict_of_json3.(gz_data) == data_dict
# @assert gz_data == data
+ jsonl_file = tempname() * ".jsonl"  # temp path with a .jsonl suffix
+ simple_table = [  # Vector of NamedTuples: iteration_style maps AbstractVectors to DirectIteration
+ (id=1, name="Alice", age=30),
+ (id=2, name="Bob", age=25),
+ (id=3, name="Charlie", age=35)
+ ]
+ write_jsonl(jsonl_file, simple_table)  # exercises the DirectIteration method, not TableIteration
+ simple_dict = read_jsonl(jsonl_file)  # read the written rows back
+ @test BazerUtils._dict_of_json3.(simple_dict) == map(row -> Dict(pairs(row)), simple_table)  # round trip; _dict_of_json3 presumably yields Symbol-keyed Dicts (confirm)
+
end
# --------------------------------------------------------------------------------------------------