딥스탯 2018. 10. 18. 14:17
11_performance

Introduction to DataFrames

Bogumił Kamiński, Apr 21, 2018

Reference

Series

In [1]:
using DataFrames
using BenchmarkTools

Performance tips

Access by column number is faster than by name

In [2]:
# Build a DataFrame from a 5x1000 matrix of random numbers (5 rows, 1000 columns).
x = DataFrame(rand(5, 1000))
# Time column lookup by integer position.
# NOTE(review): `x` is a global that is not interpolated with `$` here, so the timing
# may include global-variable access overhead -- confirm against the BenchmarkTools manual.
@btime x[500];
# Time column lookup by name (a Symbol); presumably `:x500` is the auto-generated name
# of the same 500th column -- TODO confirm.
@btime x[:x500];
  13.743 ns (0 allocations: 0 bytes)
  21.816 ns (0 allocations: 0 bytes)

When working with data in a DataFrame, use barrier functions or type annotations

In [3]:
using Random

# Deliberately slow example: sums the element-wise products of the two columns of a
# freshly generated 1000000x2 random DataFrame (RNG seeded with 1 for repeatability).
function f_bad() # this function will be slow
    Random.seed!(1); x = DataFrame(rand(1000000,2))
    # The columns are pulled out of the DataFrame inside the same function that loops
    # over them, so the loop below works on values whose types the compiler cannot infer.
    y, z = x[1], x[2]
    p = 0.0
    for i in 1:nrow(x)
        # accumulate the product of the i-th entries of both columns
        p += y[i]*z[i]
    end
    p
end

# Benchmark the slow version.
@btime f_bad();
  107.867 ms (5999022 allocations: 122.06 MiB)
In [4]:
@code_warntype f_bad() # print the inferred types: Julia does not know the types of the columns in a `DataFrame`, so values read from them show up as `Any`
Body::Any
│╻            seed!4  1 ── %1  = Random.GLOBAL_RNG::MersenneTwister
││╻╷╷╷         seed!   │    %2  = $(Expr(:foreigncall, :(:jl_alloc_array_1d), Array{UInt32,1}, svec(Any, Int64), :(:ccall), 2, Array{UInt32,1}, 0, 0))::Array{UInt32,1}
│││╻╷╷╷╷╷╷╷     make_seed   │    %3  = (Core.lshr_int)(1, 63)::Int64
││││┃│││││││     push!   │    %4  = (Core.trunc_int)(Core.UInt8, %3)::UInt8
│││││┃││││││      _growend!   │    %5  = (Core.eq_int)(%4, 0x01)::Bool
││││││┃││││        cconvert   └───       goto #3 if not %5
│││││││┃│││         convert   2 ──       invoke Core.throw_inexacterror(:check_top_bit::Symbol, Int64::Any, 1::Int64)
││││││││┃││          Type   └───       $(Expr(:unreachable))
│││││││││┃│           toUInt64   3 ──       goto #4
││││││││││      4 ── %10 = (Core.bitcast)(Core.UInt64, 1)::UInt64
││││││││││      └───       goto #5
│││││││││       5 ──       goto #6
││││││││        6 ──       goto #7
│││││││         7 ──       goto #8
││││││          8 ──       $(Expr(:foreigncall, :(:jl_array_grow_end), Nothing, svec(Any, UInt64), :(:ccall), 2, :(%2), :(%10), :(%10)))
││││││          └───       goto #9
│││││╻╷╷╷╷        lastindex   9 ── %17 = (Base.arraysize)(%2, 1)::Int64
││││││╻╷╷╷         eachindex   │    %18 = (Base.slt_int)(%17, 0)::Bool
│││││││┃│││││       axes1   │    %19 = (Base.ifelse)(%18, 0, %17)::Int64
│││││╻            setindex!   │          (Base.arrayset)(true, %2, 0x00000001, %19)
││││╻            push!   └───       goto #10
││││╻            >>   10 ─       (Base.ifelse)(true, 0, 0)
│││╻            make_seed   └───       goto #11
│││             11 ─       invoke Random.seed!(%1::MersenneTwister, %2::Array{UInt32,1})
│││             └───       goto #12
││              12 ─       goto #13
│╻╷           rand   13 ─ %27 = Random.GLOBAL_RNG::MersenneTwister
││┃│╷╷╷        rand   │    %28 = $(Expr(:foreigncall, :(:jl_alloc_array_2d), Array{Float64,2}, svec(Any, Int64, Int64), :(:ccall), 3, Array{Float64,2}, 1000000, 2, 2, 1000000))::Array{Float64,2}
│││╻╷           rand   │    %29 = (Base.arraylen)(%28)::Int64
││││╻            rand!   │    %30 = (Base.mul_int)(8, %29)::Int64
│││││╻            rand!   │    %31 = (Base.arraylen)(%28)::Int64
││││││╻            _rand!   │    %32 = (Base.mul_int)(8, %31)::Int64
│││││││╻            <=   │    %33 = (Base.sle_int)(%30, %32)::Bool
│││││││         └───       goto #15 if not %33
│││││││         14 ─       goto #16
   15 ─       nothing
│││││││         16 ┄ %37 = φ (#14 => true, #15 => false)::Bool
│││││││         └───       goto #18 if not %37
│││││││╻            macro expansion   17 ─ %39 = $(Expr(:gc_preserve_begin, :(%28)))
││││││││╻╷           pointer   │    %40 = $(Expr(:foreigncall, :(:jl_array_ptr), Ptr{Float64}, svec(Any), :(:ccall), 1, :(%28)))::Ptr{Float64}
││││││││╻            Type   │    %41 = %new(Random.UnsafeView{Float64}, %40, %29)::Random.UnsafeView{Float64}
││││││││        │          invoke Random.rand!(%27::MersenneTwister, %41::Random.UnsafeView{Float64}, $(QuoteNode(Random.SamplerTrivial{Random.CloseOpen01{Float64},Float64}(Random.CloseOpen01{Float64}())))::Random.SamplerTrivial{Random.CloseOpen01{Float64},Float64})
││││││││        │          $(Expr(:gc_preserve_end, :(%39)))
││││││││        └───       goto #19
│││││││╻            Type   18 ─ %45 = %new(Core.AssertionError, "sizeof(Float64) * n64 <= sizeof(T) * length(A) && isbitstype(T)")::AssertionError
│││││││         │          (Base.throw)(%45)
│││││││         └───       $(Expr(:unreachable))
││││││          19 ┄       goto #20
│││││           20 ─       goto #21
││││            21 ─       goto #22
│││             22 ─       goto #23
││              23 ─       goto #24
   24 ─ %53 = Main.DataFrame::Core.Compiler.Const(DataFrame, false)
││╻            size   │    %54 = (Base.arraysize)(%28, 2)::Int64
││              │    %55 = invoke DataFrames.gennames(%54::Int64)::Array{Symbol,1}
││╻            Type   │    %56 = invoke DataFrames.:(#DataFrame#60)(false::Bool, %53::Type, %28::Array{Float64,2}, %55::Array{Symbol,1})::DataFrame
│╻╷           getindex5  │    %57 = (DataFrames.getfield)(%56, :columns)::Array{AbstractArray{T,1} where T,1}
││              │    %58 = π (1, Int64)
││╻            getindex   │    %59 = (Base.arrayref)(true, %57, %58)::AbstractArray{T,1} where T
││╻            columns   │    %60 = (DataFrames.getfield)(%56, :columns)::Array{AbstractArray{T,1} where T,1}
││              │    %61 = π (2, Int64)
││╻            getindex   │    %62 = (Base.arrayref)(true, %60, %61)::AbstractArray{T,1} where T
7  │    %63 = invoke Main.nrow(%56::DataFrame)::Int64
│╻╷╷╷         Colon   │    %64 = (Base.sle_int)(1, %63)::Bool
││╻            Type   │          (Base.sub_int)(%63, 1)
│││┃            unitrange_last   │    %66 = (Base.ifelse)(%64, %63, 0)::Int64
││╻╷╷          isempty   │    %67 = (Base.slt_int)(%66, 1)::Bool
││              └───       goto #26 if not %67
││              25 ─       goto #27
││              26 ─       goto #27
   27 ┄ %71 = φ (#25 => true, #26 => false)::Bool
   │    %72 = φ (#26 => 1)::Int64
   │    %73 = φ (#26 => 1)::Int64
   │    %74 = (Base.not_int)(%71)::Bool
   └───       goto #33 if not %74
   28 ┄ %76 = φ (#27 => 0.0, #32 => %82)::Any
   │    %77 = φ (#27 => %72, #32 => %88)::Int64
   │    %78 = φ (#27 => %73, #32 => %89)::Int64
8  │    %79 = (Base.getindex)(%59, %77)::Any
   │    %80 = (Base.getindex)(%62, %77)::Any
   │    %81 = (%79 * %80)::Any
   │    %82 = (%76 + %81)::Any
││╻            ==   │    %83 = (%78 === %66)::Bool
││              └───       goto #30 if not %83
││              29 ─       goto #31
││╻            +   30 ─ %86 = (Base.add_int)(%78, 1)::Int64
│╻            iterate   └───       goto #31
   31 ┄ %88 = φ (#30 => %86)::Int64
   │    %89 = φ (#30 => %86)::Int64
   │    %90 = φ (#29 => true, #30 => false)::Bool
   │    %91 = (Base.not_int)(%90)::Bool
   └───       goto #33 if not %91
   32 ─       goto #28
10 33 ─ %94 = φ (#31 => %82, #27 => 0.0)::Any
   └───       return %94
In [5]:
# solution 1 is to use barrier function (it should be possible to use it in almost any code)
"""
    f_inner(y, z)

Return the sum of the products `y[i] * z[i]` taken over every index `i` of `y`.

The accumulator starts at `0.0`, so an empty `y` yields `0.0`. `z` must be indexable
at every index of `y`; otherwise indexing `z` throws a `BoundsError`.
"""
function f_inner(y, z)
    p = 0.0
    # `eachindex(y)` rather than `1:length(y)`: identical for ordinary vectors, and
    # also correct for arrays whose indices do not start at 1.
    for i in eachindex(y)
        p += y[i] * z[i]
    end
    return p
end

"""
    f_barrier()

Generate a 1000000x2 random `DataFrame` (RNG seeded with 1) and hand its two columns
to `f_inner`, so that the numeric loop runs inside a separate function.
"""
function f_barrier()
    Random.seed!(1)
    df = DataFrame(rand(1000000, 2))
    first_col = df[1]
    second_col = df[2]
    # The actual work happens behind the function call.
    return f_inner(first_col, second_col)
end

using LinearAlgebra

"""
    f_inbuilt()

Generate a 1000000x2 random `DataFrame` (RNG seeded with 1) and return the dot
product of its two columns, computed with the built-in `LinearAlgebra.dot` instead of
a hand-written loop.
"""
function f_inbuilt() # or use inbuilt function if possible
    Random.seed!(1)
    x = DataFrame(rand(1000000, 2))
    # `dot` is the ASCII spelling of the `⋅` operator; writing the two columns next to
    # each other with no operator between them does not parse.
    return dot(x[1], x[2])
end

# Benchmark both alternatives to the slow version.
@btime f_barrier();
@btime f_inbuilt();
  8.436 ms (44 allocations: 30.52 MiB)
  9.642 ms (44 allocations: 30.52 MiB)
In [6]:
# solution 2 is to provide the types of extracted columns
# it is simpler but there are cases in which you will not know these types
"""
    f_typed()

Generate a 1000000x2 random `DataFrame` (RNG seeded with 1) and return the sum of the
element-wise products of its two columns. The extracted columns are declared as
`Vector{Float64}` so the loop operates on values of a known concrete type.
"""
function f_typed()
    Random.seed!(1)
    df = DataFrame(rand(1000000, 2))
    # Type declarations on the locals stand in for the missing column type information.
    first_col::Vector{Float64} = df[1]
    second_col::Vector{Float64} = df[2]
    total = 0.0
    for row in 1:nrow(df)
        total += first_col[row] * second_col[row]
    end
    return total
end

# Benchmark the type-annotated version.
@btime f_typed();
  8.464 ms (44 allocations: 30.52 MiB)

Consider using a delayed DataFrame creation technique

In [7]:
"""
    f1()

Create a `DataFrame` of `Float64` with 10^4 rows and 100 columns up front, then fill
it column by column, element by element, with `rand()`. Returns the filled `DataFrame`.
"""
function f1()
    df = DataFrame(Float64, 10^4, 100) # the DataFrame exists before any data is written
    for col_idx in 1:ncol(df)
        column = df[col_idx]
        for row_idx in 1:nrow(df)
            column[row_idx] = rand()
        end
    end
    return df
end

"""
    f2()

Build 100 columns of 10^4 random `Float64` values each as plain vectors, collect them
in a `Vector{Any}`, and only then wrap them in a `DataFrame`, which is returned.
"""
function f2()
    ncols, nrows = 100, 10^4
    columns = Vector{Any}(undef, ncols)
    for c in eachindex(columns)
        # one `rand()` call per element, in row order within the column
        columns[c] = Float64[rand() for _ in 1:nrows]
    end
    # The DataFrame is created last, once all the data is ready.
    return DataFrame(columns)
end

# Benchmark filling an existing DataFrame (f1) against building the columns first and
# creating the DataFrame at the end (f2).
@btime f1();
@btime f2();
  22.924 ms (1950037 allocations: 37.42 MiB)
  2.098 ms (937 allocations: 7.69 MiB)

You can add rows to a DataFrame in place and it is fast

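  • Note (not explained in the original text): the sizes change because `@btime` runs the benchmarked expression many times, and `push!` and `append!` modify `x` in place, so every run adds one more row to `x`. `vcat` returns a new DataFrame each time and leaves `x` unchanged.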
In [8]:
# Starting table: 10^6 rows, 5 columns of random numbers.
x = DataFrame(rand(10^6, 5))
# A one-row, five-column DataFrame holding 1.0 through 5.0.
y = DataFrame(transpose(1.0:5.0))
# The same five values as a plain vector, used as a single row for push!.
z = [1.0:5.0;]
println("Size of original x = ",size(x))
@btime vcat($x, $y); # creates a new DataFrame - slow; `x` itself is not modified
println("Size of result after running vcat = ", size(vcat(x,y)))
# push! mutates `x`, and the benchmark evaluates the expression many times, so `x`
# ends up with far more than one extra row (see the size printed below).
@btime push!($x, $z); # add a single row in place - fast
println("Size of x after running push! = ", size(x))
println(" ")
x = DataFrame(rand(10^6, 5)) # reset to the same starting point
println("Size of original x = ", size(x))
# append! also mutates `x` on every benchmark evaluation, so `x` keeps growing here too.
@btime append!($x, $y); # in place - fastest
println("Size of x after running append! = ", size(x))
Size of original x = (1000000, 5)
  6.643 ms (135 allocations: 38.15 MiB)
Size of result after running vcat = (1000001, 5)
  204.251 ns (5 allocations: 80 bytes)
Size of x after running push! = (7350502, 5)
 
Size of original x = (1000000, 5)
  164.216 ns (1 allocation: 16 bytes)
Size of x after running append! = (9260502, 5)

Allowing missing as well as categorical slows down computations

In [9]:
using StatsBase

"""
    test(data)

Print `eltype(data)`, draw 10^6 random elements from `data`, and benchmark `countmap`
on the raw sample and on its `categorical` version. Returns `nothing`.
"""
function test(data)
    println(eltype(data))
    raw_sample = rand(data, 10^6)
    cat_sample = categorical(raw_sample)
    println(" raw:")
    @btime countmap($raw_sample)
    println(" categorical:")
    @btime countmap($cat_sample)
    return nothing
end

# Case 1: integer values.
println("Using test(1:10)")
test(1:10)
println(" ")
# Case 2: ten random strings.
println("Using test([randstring() for i in 1:10])")
test([randstring() for i in 1:10])
println(" ")
# Case 3: integer values passed through `allowmissing`.
println("Using test(allowmissing(1:10))")
test(allowmissing(1:10))
println(" ")
# Case 4: random strings passed through `allowmissing`.
println("Using test(allowmissing([randstring() for i in 1:10]))")
test(allowmissing([randstring() for i in 1:10]))
Using test(1:10)
Int64
 raw:
  5.340 ms (8 allocations: 7.63 MiB)
 categorical:
  20.467 ms (4 allocations: 608 bytes)
 
Using test([randstring() for i in 1:10])
String
 raw:
  33.041 ms (4 allocations: 608 bytes)
 categorical:
  38.489 ms (4 allocations: 608 bytes)
 
Using test(allowmissing(1:10))
Union{Missing, Int64}
 raw:
  13.648 ms (4 allocations: 624 bytes)
 categorical:
  20.305 ms (4 allocations: 608 bytes)
 
Using test(allowmissing([randstring() for i in 1:10]))
Union{Missing, String}
 raw:
  19.645 ms (4 allocations: 608 bytes)
 categorical:
  29.604 ms (4 allocations: 608 bytes)