baumgold commented on code in PR #325:
URL: https://github.com/apache/arrow-julia/pull/325#discussion_r894582858
##########
src/write.jl:
##########
@@ -135,16 +135,13 @@ mutable struct Writer{T<:IO}
end
function Base.open(::Type{Writer}, io::T,
compress::Union{Nothing,LZ4FrameCompressor,<:AbstractVector{LZ4FrameCompressor},ZstdCompressor,<:AbstractVector{ZstdCompressor}},
writetofile::Bool, largelists::Bool, denseunions::Bool, dictencode::Bool,
dictencodenested::Bool, alignment::Integer, maxdepth::Integer, ntasks::Integer,
meta::Union{Nothing,Any}, colmeta::Union{Nothing,Any}, closeio::Bool) where
{T<:IO}
- if ntasks < 1
- throw(ArgumentError("ntasks keyword argument must be > 0; pass
`ntasks=1` to disable multithreaded writing"))
- end
msgs = OrderedChannel{Message}(ntasks)
schema = Ref{Tables.Schema}()
firstcols = Ref{Any}()
dictencodings = Dict{Int64,Any}() # Lockable{DictEncoding}
blocks = (Block[], Block[])
# start message writing from channel
- threaded = ntasks > 1
+ threaded = Threads.nthreads() > 1
Review Comment:
Thanks for your comment. I don't think this change is breaking. In this
context, `threaded` just determines whether to use threads or coroutines. In
either case the processing of each partition is done in parallel and then
sequenced through an `OrderedChannel` to be serialized to the IO. The IO
serialization is sequential (not parallel) because order matters, so no locking
is required. We should use threads when they are available; otherwise we should
use coroutines.
The `ntasks` parameter is orthogonal to the choice of threads or coroutines.
`ntasks` defines how large the buffer within the `OrderedChannel` can be. A
larger buffer means the parallel processing of partitions can outpace the
sequential serialization of the results to the IO, because the processed
results can be cached in the buffer.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]